為什麼 C++ 只比 VBA 快 4倍?

用C++和VBA分別寫了一段用隨機數字測試pi 的值的演算法,用500萬個隨機數,發現VBA需要570毫秒,而C++則需要150毫秒。這也太讓人失望了吧,C++寫起來那麼麻煩,起碼也要快個幾個數量級吧?python就更別說了,居然要7秒鐘。求各位大神幫我看看是不是我的代碼沒優化好?

(平台是Visual Studio,已經開了Release模式)

計算方法:生成a,b兩個在0 到 r 之間的隨機數,數一數這些數字裡面有多少個落在了半徑為r的1/4圓的扇形裡面,用這個數字代表扇形面積,用總隨機數數量代表正方形面積。因為扇形的面積是 $\frac{\pi r^2}{4}$,而正方形的面積是 $r^2$,可得知 $\pi = \text{扇形面積}/\text{正方形面積} \times 4$。(蒙地卡羅方法)

C++代碼:

#include "stdafx.h"
#include <iostream>   // header names were lost in the scrape; cout/endl need <iostream>
#include <ctime>      // clock(), time()

using namespace std;

// OP's original benchmark: Monte-Carlo estimate of pi from 2*2.5M points.
// Quirks preserved on purpose (they are discussed later in the article):
//  - rand_max is a double, forcing float<->int conversions in the loop
//  - rand() is the perf bottleneck the answers analyze
int main()
{
	double st = clock();
	double rand_max = 32767;            // NOTE(review): kept as double — the article's profiling hinges on this
	srand((int)time(0));
	unsigned int simulate_total = 2500000;
	unsigned int inside_count = 0;
	unsigned int radius = rand_max * rand_max;   // 32767^2, squared radius of the quarter circle

	unsigned int randA;
	unsigned int randB;

	unsigned int randA_opp;
	unsigned int randB_opp;

	// Each iteration tests a point and its mirror (antithetic variate),
	// so the final ratio is multiplied by 2, not 4.
	for (unsigned int i = 1; i < simulate_total; i++) {
		randA = rand();
		randB = rand();
		if ((randA * randA + randB * randB) < radius) {
			inside_count++;
		}
		randA_opp = rand_max - randA;   // implicit double->unsigned conversion (the profiled hotspot)
		randB_opp = rand_max - randB;
		if ((randA_opp * randA_opp + randB_opp * randB_opp) < radius) {
			inside_count++;
		}
	}
	cout << inside_count / double(simulate_total) * 2 << endl;   // pi estimate
	cout << clock() - st << endl;                                 // elapsed ticks (ms on MSVC)
	return 0;
}

VBA代碼:

Sub simulate_pi()
    ' Monte-Carlo estimate of pi: each iteration tests a random point and its
    ' mirror (1-x, 1-y) against the unit quarter circle, hence the final *2.
    Dim area_count As Double: area_count = 0
    Dim simulate_count As Double: simulate_count = 2500000

    Dim i As Double
    Dim randA As Double, randB As Double
    Dim randA_opp As Double, randB_opp As Double
    For i = 1 To simulate_count
        randA = Math.Rnd()
        randB = Math.Rnd()

        randA_opp = 1 - randA
        randB_opp = 1 - randB

        If randA * randA + randB * randB < 1 Then
            area_count = area_count + 1
        End If
        If randA_opp * randA_opp + randB_opp * randB_opp < 1 Then
            area_count = area_count + 1
        End If
    Next i
    Debug.Print "Estimate: ", area_count / simulate_count * 2
End Sub

' Timing in VBA: create a new module, paste the code below into it, then run test.
Option Explicit
Private Declare Function getFrequency Lib "kernel32" _
    Alias "QueryPerformanceFrequency" (cyFrequency As Currency) As Long
Private Declare Function getTickCount Lib "kernel32" _
    Alias "QueryPerformanceCounter" (cyTickCount As Currency) As Long

Function MicroTimer() As Double
    ' Returns seconds since an arbitrary origin, using the Win32
    ' high-resolution performance counter.
    Dim cyTicks1 As Currency
    Static cyFrequency As Currency

    MicroTimer = 0
    ' Get frequency (once; cached in the Static).
    If cyFrequency = 0 Then getFrequency cyFrequency
    ' Get ticks.
    getTickCount cyTicks1
    ' Seconds = ticks / frequency.
    If cyFrequency Then MicroTimer = cyTicks1 / cyFrequency
End Function

Sub test()
    Dim st: st = MicroTimer
    Call simulate_pi
    Debug.Print (MicroTimer - st) * 1000   ' elapsed milliseconds
End Sub

Python代碼:

#!/usr/bin/python
# Filename : pi_simulate.py
# Monte-Carlo pi estimate: each iteration tests one random point and its
# mirror (1-x, 1-y) against the unit quarter circle, hence the final *2.

import random
import time

time_start = time.perf_counter()

inside_count = 0
simulate_total = 2500000

randA = 0
randB = 0

for i in range(1, simulate_total):
    randA = random.uniform(0, 1)
    randB = random.uniform(0, 1)
    if (randA * randA + randB * randB) < 1:
        inside_count = inside_count + 1
    randA_opp = 1 - randA
    randB_opp = 1 - randB
    if (randA_opp * randA_opp + randB_opp * randB_opp) < 1:
        inside_count = inside_count + 1

print(inside_count / simulate_total * 2)
print("Time spent", time.perf_counter() - time_start)


C++和其他語言不同的地方,就是可以做底層優化,我來試一下。在我的機器上,使用VC2013 32bit:

問題原C++版本 156ms
grapeot的版本 23ms

我覺得用32位浮點數比較精確,改了一下:

// milo_float
#include &
#include &
#include &

using namespace std;

class LCG {
public:
LCG(uint32_t seed) : mSeed(seed) {}
float operator()() {
mSeed = mSeed * 214013 + 2531011;
union {
uint32_t u;
float f;
}u = { (mSeed &>&> 9) | 0x3F800000 };
return u.f - 1.0f;
}
private:
uint32_t mSeed;
};

void main()
{
LARGE_INTEGER st;
QueryPerformanceCounter(st);

const unsigned simulate_total = 2500000;
unsigned inside_count = 0;

LCG rng(0);
for (unsigned i = 1; i &< simulate_total; i++){ float a = rng(); float b = rng(); if (a * a + b * b &< 1.0f) inside_count++; a = 1.0f - a; b = 1.0f - b; if (a * a + b * b &< 1.0f) inside_count++; } LARGE_INTEGER end, freq; QueryPerformanceCounter(end); QueryPerformanceFrequency(freq); cout &<&< inside_count / double(simulate_total) * 2 &<&< endl; cout &<&< (end.QuadPart - st.QuadPart) * 1000.0 / freq.QuadPart &<&< endl; }

這個版本慢了一點,只有33ms。

然後,用SSE2 intrinsic:

// milo_SSE2
#include &
#include &
#include &
#include &

using namespace std;

static const __m128i cLCG1 = _mm_set1_epi32(214013);
static const __m128i cLCG2 = _mm_set1_epi32(2531011);
static const __m128i cLCGmask = _mm_set1_epi32(0x3F800000);
static const __m128i cOnei = _mm_set1_epi32(1);
static const __m128 cOne = _mm_set1_ps(1.0f);

inline __m128i mul_uint32(__m128i a, __m128i b) {
const __m128i tmp1 = _mm_mul_epu32(a, b);
const __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
}

class LCG {
public:
LCG(__m128i seed) : mSeed(seed) {}
__m128 operator()() {
mSeed = _mm_add_epi32(mul_uint32(mSeed, cLCG1), cLCG2);
const __m128i u = _mm_or_si128(_mm_srli_epi32(mSeed, 9), cLCGmask);
return _mm_sub_ps(_mm_castsi128_ps(u), cOne);
}
private:
__m128i mSeed;
};

void main()
{
LARGE_INTEGER st;
QueryPerformanceCounter(st);

const unsigned simulate_total = 2500000;
__m128i inside_count1 = _mm_setzero_si128();
__m128i inside_count2 = _mm_setzero_si128();

LCG rng1(_mm_setr_epi32(0, 1, 2, 3)), rng2(_mm_setr_epi32(4, 5, 6, 7));
for (unsigned i = 1; i &< simulate_total / 4; i++){ const __m128 a = rng1(); const __m128 b = rng2(); const __m128 r1 = _mm_cmplt_ps(_mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b)), cOne); inside_count1 = _mm_add_epi32(inside_count1, _mm_and_si128(_mm_castps_si128(r1), cOnei)); const __m128 c = _mm_sub_ps(cOne, a); const __m128 d = _mm_sub_ps(cOne, b); const __m128 r2 = _mm_cmplt_ps(_mm_add_ps(_mm_mul_ps(c, c), _mm_mul_ps(d, d)), cOne); inside_count2 = _mm_add_epi32(inside_count2, _mm_and_si128(_mm_castps_si128(r2), cOnei)); } unsigned inside_count = 0; for (int i = 0; i &< 4; i++) inside_count += inside_count1.m128i_u32[i] + inside_count2.m128i_u32[i]; LARGE_INTEGER end, freq; QueryPerformanceCounter(end); QueryPerformanceFrequency(freq); cout &<&< inside_count / double(simulate_total) * 2 &<&< endl; cout &<&< (end.QuadPart - st.QuadPart) * 1000.0 / freq.QuadPart &<&< endl; }

這個版本5.5ms。

然後,我們可以使用OpenMP,簡單地做並行,這裡使用4個線程:

// milo_SSE2_OpenMP
#include <omp.h>   // header name was lost in the scrape

// ... (LCG class, constants and mul_uint32 identical to milo_SSE2 above)
unsigned inside_count = 0;

// 4 threads; each thread owns 2 independent 4-lane LCG streams seeded by
// thread id so streams never overlap. reduction(+) merges the per-thread totals.
#pragma omp parallel num_threads(4) reduction(+ : inside_count)
{
	__m128i inside_count1 = _mm_setzero_si128();
	__m128i inside_count2 = _mm_setzero_si128();

	int j = omp_get_thread_num() * 8;   // 8 distinct seeds per thread
	LCG rng1(_mm_setr_epi32(j + 0, j + 1, j + 2, j + 3)), rng2(_mm_setr_epi32(j + 4, j + 5, j + 6, j + 7));

	// simulate_total / 16 = total / (4 threads * 4 SIMD lanes)
	for (unsigned i = 1; i < simulate_total / 16; i++) {
		const __m128 a = rng1();
		const __m128 b = rng2();
		const __m128 r1 = _mm_cmplt_ps(_mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b)), cOne);
		inside_count1 = _mm_add_epi32(inside_count1, _mm_and_si128(_mm_castps_si128(r1), cOnei));
		const __m128 c = _mm_sub_ps(cOne, a);
		const __m128 d = _mm_sub_ps(cOne, b);
		const __m128 r2 = _mm_cmplt_ps(_mm_add_ps(_mm_mul_ps(c, c), _mm_mul_ps(d, d)), cOne);
		inside_count2 = _mm_add_epi32(inside_count2, _mm_and_si128(_mm_castps_si128(r2), cOnei));
	}
	for (int i = 0; i < 4; i++)
		inside_count += inside_count1.m128i_u32[i] + inside_count2.m128i_u32[i];
}

這個版本1.7ms。

總結:

問題原C++版本 156ms
grapeot的版本 23ms
milo_float 33ms
milo_SSE2 5.5ms
milo_SSE2_OpenMP 1.7ms

「犯規」地用 SSE2 + OpenMP 以及不同的隨機數生成器後,最後的版本大約為原C++版本提升90倍性能,那麼按問題所說的C++是VBA的4倍,那麼這個版本會是VBA的360倍左右。估計用AVX等更新的指令集可以再快一點。


這是一個非常好的問題。我們面對的是一個非常模糊的問題,下面主要說如何利用工具來尋找線索/原因,進而找到解決方法——這也是高級碼農的基本修養。

我首先在VS2013, Release模式下面跑了一下C++代碼,時間是208ms。基本重現了C++部分的結果。這個速度的確不能算快。在我們推測具體原因以前,一定一定要先profile,看看每一行所花費的時間是多少(不成熟的優化是萬惡之源)。在VS里這個非常簡單(要求VS Ultimate,但一般訂閱了MSDN AA的學校都有免費的VS Ultimate)。只要點一個按鈕就好了,如下圖所示。

出來的結果類似:

我們可以看到,如 @姚冬 所說,rand()佔用了56.3%的時間,是性能的瓶頸。此外,整數的減法也佔用了33.4%的時間。知道了原因以後就好辦了。第一步,放狗搜索c++ slow rand。出來一大坨結果。其中一個是

Need a fast random generator for c++

裡面提供了一個快速的rand()實現。把代碼粘進去。

/* Fast replacement for CRT rand(): the same LCG constants MSVC uses,
 * but without rand()'s per-call TLS lookup. Returns 15 bits, like rand(). */
unsigned int g_seed = 0;
unsigned int fastrand() {
	g_seed = (214013 * g_seed + 2531011);
	return (g_seed >> 16) & 0x7FFF;   /* the '&' was eaten by the scrape */
}

瞬間提速1倍到了99ms,結果不變。再profile得到:

一個詭異的地方是這是個整數運算應該極其快,而且不應該有類型轉換在裡面。但是profiler說50%的時間都在float到long的類型轉換上面(注意圖片右上角)。進一步檢查發現,rand_max怎麼是double。改成int,結果不變,時間變成43ms,進一步提速2倍。再profile得到:

這時候我們可以看到時間非常平均,也沒有明顯瓶頸了。優化結束。

考慮到我們機器的不同,我的43ms大約相當於你的機器的31ms,相比於vba的570ms有了18倍的性能差異,感覺還是比較合理的。幾點討論:

  • 其實在這個回答里,核心並不是程序優化的具體技巧,而是拿到一個問題如何思考和利用工具的通用方法。比如即使我們不知道profiler這個東西,通過搜索"代碼 每一行 時間"也可以很快知道有這樣的工具叫做profiler,並且學會怎麼使用。即使不知道rand這個函數怎麼加速,通過搜索引擎也可以找到別人寫好的現成代碼。另一方面是發現瓶頸之後也不要著急自己修復,如果不是特別一目了然的話,先看看別人是怎麼做的。站在巨人的肩膀上,事半功倍。所以關鍵在於時刻知道自己想要的是什麼,和分析-調研-實驗的思維習慣。
  • 具體關於程序優化,我們絕大多數人沒有 @姚冬 那麼牛的經驗,一眼就能看出問題在哪裡。所以遇到性能問題,第一反應應當是用profiler看看瓶頸到底在哪。而且一個經驗是這個瓶頸往往是很難猜的——比如這個例子直接看代碼第一反應往往是用代數和工程方法去優化算半徑的那部分。但就算這部分做到極致,rand速度提不上去,最多也只能把時間降到原來的一半,事倍功半。以前我寫代碼也會在寫的時候用各種奇技淫巧提升速度,但後來發現總體上程序的速度並沒有得到提升。因為程序80%的時間其實花在20%的代碼里,剩下80%的代碼就算花個兩個月優化到速度無窮快,也還是白瞎。所以一個兼顧開發和運行效率的方法是,先怎麼方便開發怎麼寫,然後用profiler找到瓶頸再有針對性地優化。
  • 前兩點不僅可以節省時間,可能更重要的是,如果你面對的不僅是一個工程,而且是老闆,你要說服老闆你這麼做的原因。這些profile的結果、別人的討論、你自己的實驗結果,都會1) 說服老闆你這麼做是對的,2) 給老闆留下深刻印象:你幹了很多事,腦子清楚。以後升遷啥的都有幫助。

  • 不懂的問題上知乎問!這也是非常重要的一部分。
  • 上面用的是Windows平台的VS,方便好用但也非常貴。如果是Linux平台下可以用gprof(不曉得有沒有GUI版本的,望指教)。Mac下可以用XCode。但基本思路都是一樣的。


你單獨計算下生成隨機數的時間就知道原因了。

我猜 rand 才是性能瓶頸,因為現在 C runtime是要考慮多線程的,所以rand里有TLS操作。

除了 @grapeot 的優化方法外

if ((randA_opp * randA_opp + randB_opp * randB_opp) < radius){

把這個表達式做代數展開,根據 (a-b) * (a-b) = a * a - 2 * a *b + b * b 公式

rand_max * rand_max - 2 * rand_max * randA + randA * randA +
rand_max * rand_max - 2 * rand_max * randB + randB * randB

再合併同類項

rand_max * rand_max * 2 - ( 2 * rand_max - randA) * randA - ( 2 * rand_max - randB) * randB

看到了嗎?其中 rand_max是常量,可以常量化

const unsigned int rand_max = 32767;
const unsigned int rand_max_2 = rand_max * 2;
const unsigned int radius = rand_max * rand_max;
const unsigned int radius_2 = radius * 2;

radius_2 - ( rand_max_2 - randA) * randA - ( rand_max_2 - randB) * randB

因為常量會被編譯成立即數,會快一點,這樣還可以再壓榨出幾毫秒。

終極結果大概也就這樣了吧,如果只用C++的話。

還有個優化掉if的方法,但是影響可讀性了,沒必要那麼極端。


根據排名第一 @grapeot的思路,我用 Python 下的 line_profiler 工具來跑一邊題主給的代碼。

為了配合工具,我小小的修改了一下答主的代碼

import random
import time

@profile  # decorator injected by kernprof/line_profiler at run time
def pi_simulate():
    """Monte-Carlo pi estimate; wrapped in a function so line_profiler
    can report per-line timings. Prints the estimate and elapsed time."""
    time_start = time.perf_counter()

    inside_count = 0
    simulate_total = 2500000

    randA = 0
    randB = 0

    for i in range(1, simulate_total):
        randA = random.uniform(0, 1)
        randB = random.uniform(0, 1)
        if (randA * randA + randB * randB) < 1:
            inside_count = inside_count + 1
        # Antithetic mirror point — hence the final *2, not *4.
        randA_opp = 1 - randA
        randB_opp = 1 - randB
        if (randA_opp * randA_opp + randB_opp * randB_opp) < 1:
            inside_count = inside_count + 1

    print(inside_count / simulate_total * 2)
    print("Time spent", time.perf_counter() - time_start)

pi_simulate()

測試的結果如下:

可以看出的確是 random.uniform(0,1) 這行影響了性能

把這句改成 random.random() 嘗試跑一遍結果如下

代碼修改前後用時前後對比

3.1404152
Time spent 3.148782492004102

3.1407416
Time spent 1.6630259380035568

節省了接近一半的時間


樓上已經有人提到了python 的性能瓶頸也是在隨機數的生成上。但除此之外,Python作為直譯語言,本身在大量loop的情況下就很吃虧,所以如果能優化一下演算法,可能跟C++對比也不會輸得那麼難看。

首先這是樓主的代碼:

def pi_orig(simulate_total=2500000):
    """Monte-Carlo pi estimate (the OP's original algorithm).

    Each iteration tests a random point and its mirror (1-x, 1-y) against
    the unit quarter circle, so the estimate is ratio * 2, not * 4.

    Args:
        simulate_total: number of loop iterations (default preserves the
            original hard-coded 2 500 000).

    Returns:
        The pi estimate as a float. (The original returned nothing; the
        estimate is now returned so callers can check it.)
    """
    inside_count = 0

    randA = 0
    randB = 0

    # range(1, N) runs N-1 iterations — kept from the original (off by one).
    for i in range(1, simulate_total):
        randA = random.uniform(0, 1)
        randB = random.uniform(0, 1)
        if (randA * randA + randB * randB) < 1:
            inside_count = inside_count + 1
        randA_opp = 1 - randA
        randB_opp = 1 - randB
        if (randA_opp * randA_opp + randB_opp * randB_opp) < 1:
            inside_count = inside_count + 1
    return inside_count / simulate_total * 2

接下來是 @virusdefender 使用了numpy的代碼

import numpy as np
import random

def pi_calc(simulate_total=2500000):
    """Monte-Carlo pi estimate using scalar np.random.uniform per draw.

    Args:
        simulate_total: iteration count (default keeps the original value).

    Returns:
        float pi estimate (ratio of hits * 2; each iteration also tests the
        antithetic mirror point, hence *2 rather than *4).
    """
    inside_count = 0

    randA = 0
    randB = 0

    # Original used Python 2's xrange; range is the Python 3 equivalent.
    for i in range(1, simulate_total):
        randA = np.random.uniform(0, 1)
        randB = np.random.uniform(0, 1)
        if (randA * randA + randB * randB) < 1:
            inside_count = inside_count + 1
        randA_opp = 1 - randA
        randB_opp = 1 - randB
        if (randA_opp * randA_opp + randB_opp * randB_opp) < 1:
            inside_count = inside_count + 1
    return float(inside_count) / float(simulate_total) * 2

再是避免使用loop的代碼:

def pi_calc2(simulate_total=5000000):
    """Loop-free Monte-Carlo pi estimate: draw all points at once with numpy.

    Args:
        simulate_total: number of points (the original hard-coded a FLOAT
            literal 5000000., which modern numpy rejects as an array shape —
            it is now an int parameter).

    Returns:
        numpy float pi estimate (fraction inside the quarter circle * 4).
    """
    n = int(simulate_total)
    pts = np.random.random((n, 2))

    idx = (pts ** 2).sum(axis=1) < 1.0
    return idx.sum() / float(n) * 4

用timeit 計時結果如下:


根據 @grapeot 和 @Milo Yip的思路,寫了一個C語言的版本。

平台:Ubuntu 14.04 LTS, gcc

原始版本:

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>

/* Baseline C version: 1e9 points, CRT rand(), no mirror-point trick
 * (so the result is ratio * 4 here). */
int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);   /* '&' restored (eaten by the scrape) */
	int i;
	int simulate_total = 1000000000;
	double radius = (double) RAND_MAX;
	double radius_square = radius * radius;
	double x, y;
	int inside_count = 0;
	srand(time(NULL));
	for (i = 0; i < simulate_total; i++) {
		x = (double) rand();
		y = (double) rand();
		if (x * x + y * y < radius_square)
			inside_count += 1;
	}
	printf("%f\n", 4.0 * inside_count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

10億個隨機數,用時40s

用kcachegrind進行profile,發現rand()果然是瓶頸,從網上找了一個快速rand的實現:

https://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>

unsigned int g_seed;

/* Used to seed the generator. */
inline void fast_srand(int seed)
{
	g_seed = seed;
}

/* fastrand routine returns one integer, similar output value range as C lib. */
inline int fastrand()
{
	g_seed = (214013 * g_seed + 2531011);
	return (g_seed >> 16) & 0x7FFF;   /* the '&' was eaten by the scrape */
}

/* Same benchmark as the baseline, with rand() replaced by the fast LCG
 * and the computation kept in integers (no double conversions). */
int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	int i;
	int simulate_total = 1000000000;
	int radius = 0x7FFF;
	int radius_square = radius * radius;
	int x, y;
	int inside_count = 0;

	fast_srand((int) time(NULL));
	for (i = 0; i < simulate_total; i++) {
		x = fastrand();
		y = fastrand();
		if (x * x + y * y < radius_square)
			inside_count += 1;
	}
	printf("%f\n", 4.0 * inside_count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

這個版本用時4.2s

然後使用AVX:

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>

/* GCC generic vector extension: 4 x int32 per vector, auto-lowered to SSE/AVX. */
typedef int v4si __attribute__ ((vector_size (16)));

int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	v4si g_seed = {(int)time(NULL), (int)time(NULL), (int)time(NULL), (int)time(NULL)};
	v4si a = {214013, 214013, 214013, 214013};
	v4si b = {2531011, 2531011, 2531011, 2531011};
	v4si c = {16, 16, 16, 16};
	v4si d = {0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF};
	int i;
	int simulate_total = 1000000000;
	int radius_s = 0x7FFF * 0x7FFF;
	v4si radius_square = {radius_s, radius_s, radius_s, radius_s};
	v4si x, y;
	v4si inside_count = {0, 0, 0, 0};
	int count = 0;
	for (i = 0; i < simulate_total / 4; i++) {
		g_seed = (a * g_seed + b);
		x = (g_seed >> c) & d;   /* the '&' operators were eaten by the scrape */
		g_seed = (a * g_seed + b);
		y = (g_seed >> c) & d;
		/* vector compare yields -1 per true lane, so count accumulates negatively */
		inside_count = inside_count + (x * x + y * y < radius_square);
	}
	/* NOTE(review): the scraped text was truncated here; the tail below is
	 * reconstructed to match the unrolled version later in the article
	 * (note the -4.0 factor compensating for the -1-per-hit lanes). */
	for (i = 0; i < 4; i++)
		count += inside_count[i];
	printf("%f\n", -4.0 * count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

這個版本用時1.4s

查看彙編:

.L3:
vpmulld xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm5
vpsrad xmm2, xmm0, 16
vpmulld xmm0, xmm0, xmm6
vpand xmm2, xmm2, xmm4
vpaddd xmm0, xmm0, xmm5
vpsrad xmm1, xmm0, 16
vpand xmm1, xmm1, xmm4
sub eax, 1
vpmulld xmm2, xmm2, xmm2
vpmulld xmm1, xmm1, xmm1
vpaddd xmm1, xmm2, xmm1
vpcmpgtd xmm1, xmm7, xmm1
vpaddd xmm3, xmm3, xmm1
jne .L3

循環中的運算都在寄存器進行,沒有內存讀取,用不上cache優化。

循環中所有的

g_seed = (a * g_seed+b);

形成了critical path,所以我們使用loop unrolling, 增加並行度,減少循環次數,也就是減少critical path的運行時間。

loop unrolling:

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>
#define UNROLL 4
typedef int v4si __attribute__ ((vector_size (16)));

/* 4-way unrolled SIMD version: four independent LCG streams break the
 * g_seed = a*g_seed+b dependency chain (the critical path). */
int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	int t1 = (int) time(NULL);
	v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
	int t2 = (int) time(NULL);
	v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
	int t3 = (int) time(NULL);
	v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
	int t4 = (int) time(NULL);
	v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
	v4si a = {214013, 214013, 214013, 214013};
	v4si b = {2531011, 2531011, 2531011, 2531011};
	v4si c = {16, 16, 16, 16};
	v4si d = {0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF};
	int i;
	int simulate_total = 1000000000;
	int radius_s = 0x7FFF * 0x7FFF;
	v4si radius_square = {radius_s, radius_s, radius_s, radius_s};
	v4si x1, y1, x2, y2, x3, y3, x4, y4;
	v4si inside_count1 = {0, 0, 0, 0};
	v4si inside_count2 = {0, 0, 0, 0};
	v4si inside_count3 = {0, 0, 0, 0};
	v4si inside_count4 = {0, 0, 0, 0};
	int count = 0;
	for (i = 0; i < simulate_total / (UNROLL * 4); i++) {
		g_seed1 = (a * g_seed1 + b);
		x1 = (g_seed1 >> c) & d;   /* '&' restored (eaten by the scrape) */
		g_seed1 = (a * g_seed1 + b);
		y1 = (g_seed1 >> c) & d;
		inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
		g_seed2 = (a * g_seed2 + b);
		x2 = (g_seed2 >> c) & d;
		g_seed2 = (a * g_seed2 + b);
		y2 = (g_seed2 >> c) & d;
		inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
		g_seed3 = (a * g_seed3 + b);
		x3 = (g_seed3 >> c) & d;
		g_seed3 = (a * g_seed3 + b);
		y3 = (g_seed3 >> c) & d;
		inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
		g_seed4 = (a * g_seed4 + b);
		x4 = (g_seed4 >> c) & d;
		g_seed4 = (a * g_seed4 + b);
		y4 = (g_seed4 >> c) & d;
		inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
	}
	for (i = 0; i < 4; i++)
		count += inside_count1[i] + inside_count2[i] + inside_count3[i] + inside_count4[i];
	/* vector compares accumulate -1 per hit, hence the negative factor */
	printf("%f\n", -4.0 * count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

這個版本用時0.7s

然後使用多線程:

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>
#include <pthread.h>

#define UNROLL 4
#define NUM_THREADS 4

int simulate_total = 1000000000;
typedef int v4si __attribute__ ((vector_size (16)));
/* Per-thread result slots; each thread writes only its own index. */
v4si inside_count[NUM_THREADS] = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};

/* Worker: same 4-way-unrolled SIMD loop as the single-threaded version,
 * over 1/NUM_THREADS of the samples. argp carries the thread index. */
void *thread(void *argp)
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	int index = (int) argp;   /* pointer smuggled as an int, as in the original */
	int t1 = (int) time(NULL);
	v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
	int t2 = (int) time(NULL);
	v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
	int t3 = (int) time(NULL);
	v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
	int t4 = (int) time(NULL);
	v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
	v4si a = {214013, 214013, 214013, 214013};
	v4si b = {2531011, 2531011, 2531011, 2531011};
	v4si c = {16, 16, 16, 16};
	v4si d = {0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF};
	int i;
	int radius_s = 0x7FFF * 0x7FFF;
	v4si radius_square = {radius_s, radius_s, radius_s, radius_s};
	v4si x1, y1, x2, y2, x3, y3, x4, y4;
	v4si inside_count1 = {0, 0, 0, 0};
	v4si inside_count2 = {0, 0, 0, 0};
	v4si inside_count3 = {0, 0, 0, 0};
	v4si inside_count4 = {0, 0, 0, 0};

	for (i = 0; i < simulate_total / (UNROLL * 4 * NUM_THREADS); i++) {
		g_seed1 = (a * g_seed1 + b);
		x1 = (g_seed1 >> c) & d;   /* '&' restored (eaten by the scrape) */
		g_seed1 = (a * g_seed1 + b);
		y1 = (g_seed1 >> c) & d;
		inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
		g_seed2 = (a * g_seed2 + b);
		x2 = (g_seed2 >> c) & d;
		g_seed2 = (a * g_seed2 + b);
		y2 = (g_seed2 >> c) & d;
		inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
		g_seed3 = (a * g_seed3 + b);
		x3 = (g_seed3 >> c) & d;
		g_seed3 = (a * g_seed3 + b);
		y3 = (g_seed3 >> c) & d;
		inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
		g_seed4 = (a * g_seed4 + b);
		x4 = (g_seed4 >> c) & d;
		g_seed4 = (a * g_seed4 + b);
		y4 = (g_seed4 >> c) & d;
		inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
	}
	inside_count[index] += inside_count1 + inside_count2 + inside_count3 + inside_count4;
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("thread %d: this computation took about %.5f seconds\n", index,
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	return NULL;   /* missing in the original (falls off a void* function) */
}

int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	v4si sum = {0, 0, 0, 0};
	int count = 0;
	pthread_t tid[NUM_THREADS];
	int i, j;
	for (i = 0; i < NUM_THREADS; i++)
		pthread_create(tid + i, NULL, thread, (void *) i);
	for (i = 0; i < NUM_THREADS; i++)
		pthread_join(tid[i], NULL);
	for (i = 0; i < NUM_THREADS; i++)
		for (j = 0; j < 4; j++)
			count += inside_count[i][j];
	/* vector compares accumulate -1 per hit, hence the negative factor */
	printf("%f\n", -4.0 * count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

這個版本10億個隨機數用時0.340s左右,所以500萬個隨機數的平均用時約為1.7ms, 與 @Milo Yip 結果一致。

重新看了一下問題,發現可以再做一點優化:

#include <stdio.h>    /* header names were lost in the scrape */
#include <stdlib.h>
#include <time.h>
#include <pthread.h>

#define UNROLL 4
#define NUM_THREADS 4

/* Half the samples of the previous version: each drawn point also tests its
 * mirror (radius-x, radius-y), so the final factor is -2.0, not -4.0. */
int simulate_total = 500000000;
typedef int v4si __attribute__ ((vector_size (16)));
v4si inside_count[NUM_THREADS] = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};

void *thread(void *argp)
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	int index = (int) argp;
	int t1 = (int) time(NULL);
	v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
	int t2 = (int) time(NULL);
	v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
	int t3 = (int) time(NULL);
	v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
	int t4 = (int) time(NULL);
	v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
	v4si a = {214013, 214013, 214013, 214013};
	v4si b = {2531011, 2531011, 2531011, 2531011};
	v4si c = {16, 16, 16, 16};
	v4si d = {0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF};
	int i;
	int radius = 0x7FFF;
	v4si radius_square = {0x7FFF * 0x7FFF, 0x7FFF * 0x7FFF, 0x7FFF * 0x7FFF, 0x7FFF * 0x7FFF};
	v4si x1, y1, x2, y2, x3, y3, x4, y4;
	v4si x1_opp, y1_opp, x2_opp, y2_opp, x3_opp, y3_opp, x4_opp, y4_opp;
	/* [0] counts the drawn points, [1] their mirror points */
	v4si inside_count1[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
	v4si inside_count2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
	v4si inside_count3[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
	v4si inside_count4[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

	for (i = 0; i < simulate_total / (UNROLL * 4 * NUM_THREADS); i++) {
		g_seed1 = (a * g_seed1 + b);
		x1 = (g_seed1 >> c) & d;   /* '&' restored (eaten by the scrape) */
		g_seed1 = (a * g_seed1 + b);
		y1 = (g_seed1 >> c) & d;
		inside_count1[0] += (x1 * x1 + y1 * y1 < radius_square);
		x1_opp = radius - x1;
		y1_opp = radius - y1;
		inside_count1[1] += (x1_opp * x1_opp + y1_opp * y1_opp < radius_square);
		g_seed2 = (a * g_seed2 + b);
		x2 = (g_seed2 >> c) & d;
		g_seed2 = (a * g_seed2 + b);
		y2 = (g_seed2 >> c) & d;
		inside_count2[0] += (x2 * x2 + y2 * y2 < radius_square);
		x2_opp = radius - x2;
		y2_opp = radius - y2;
		inside_count2[1] += (x2_opp * x2_opp + y2_opp * y2_opp < radius_square);
		g_seed3 = (a * g_seed3 + b);
		x3 = (g_seed3 >> c) & d;
		g_seed3 = (a * g_seed3 + b);
		y3 = (g_seed3 >> c) & d;
		inside_count3[0] += (x3 * x3 + y3 * y3 < radius_square);
		x3_opp = radius - x3;
		y3_opp = radius - y3;
		inside_count3[1] += (x3_opp * x3_opp + y3_opp * y3_opp < radius_square);
		g_seed4 = (a * g_seed4 + b);
		x4 = (g_seed4 >> c) & d;
		g_seed4 = (a * g_seed4 + b);
		y4 = (g_seed4 >> c) & d;
		inside_count4[0] += (x4 * x4 + y4 * y4 < radius_square);
		x4_opp = radius - x4;
		y4_opp = radius - y4;
		inside_count4[1] += (x4_opp * x4_opp + y4_opp * y4_opp < radius_square);
	}
	for (i = 0; i < 2; i++)
		inside_count[index] += inside_count1[i] + inside_count2[i] + inside_count3[i] + inside_count4[i];
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("thread %d: this computation took about %.5f seconds\n", index,
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	return NULL;   /* missing in the original */
}

int main()
{
	struct timespec tstart = {0, 0}, tend = {0, 0};
	clock_gettime(CLOCK_MONOTONIC, &tstart);
	v4si sum = {0, 0, 0, 0};
	int count = 0;
	pthread_t tid[NUM_THREADS];
	int i, j;
	for (i = 0; i < NUM_THREADS; i++)
		pthread_create(tid + i, NULL, thread, (void *) i);
	for (i = 0; i < NUM_THREADS; i++)
		pthread_join(tid[i], NULL);
	for (i = 0; i < NUM_THREADS; i++)
		for (j = 0; j < 4; j++)
			count += inside_count[i][j];
	/* two tests per sample and -1 per hit -> factor -2.0 */
	printf("%f\n", -2.0 * count / simulate_total);
	clock_gettime(CLOCK_MONOTONIC, &tend);
	printf("This computation took about %.5f seconds\n",
	       ((double)tend.tv_sec + 1.0e-9 * tend.tv_nsec) -
	       ((double)tstart.tv_sec + 1.0e-9 * tstart.tv_nsec));
	exit(0);
}

這個版本用時0.260s左右

對於int類型,AVX只能使用128位的xmm寄存器,只能4路並行,AVX2可以使用256位的ymm寄存器,實現8路並行, 可惜AVX2只支持最新型的CPU。RDSEED和RDRAND可由硬體生成隨機數,同樣只支持最新型的CPU。如果使用AVX2、RDSEED和RDRAND, 運行時間應該可以進一步降低。

在上面多線程的基礎上再加上多進程,用時仍是0.340s左右。後來看了4線程時,CPU usage已經達到380%,所以在多線程的基礎上再加上多進程並沒有提升速度。


Python表示受到了極大的侮辱……人家有numpy……

2500000個隨機數?判斷?人人都知道python循環慢……

但是如果用這個辦法呢?

# coding: utf-8
# Vectorized Monte-Carlo pi: draw everything at once with numpy; no Python loop.

from numpy.random import random
from numpy import where

import time

time_start = time.time()

simulate_total = 2500000

x, y = random(simulate_total), random(simulate_total)
p = x**2 + y**2
opp = (1 - x)**2 + (1 - y)**2   # mirror points -> final *2 instead of *4
inside_count = where(p < 1, 1, 0).sum() + where(opp < 1, 1, 0).sum()

print(inside_count * 1.0 / simulate_total * 2)
# The original mixed a Python 2 print statement in here; normalized to py3.
print("Time spent", time.time() - time_start)

0.146s,相當於146ms,是不是感覺有點疼?


有個最主要的原因是rand.VBA的rand可不是VBA實現的。所以rand本身和C++的速度相差無幾。再加上rand佔了你測試程序的大部分運行時間。總體性能也就差不到哪兒去。

性能這種事單拿一個具體的例子來討論很誤導也很難得出正確的結論,就好比java可以在很多case裡面比c++都還快,但是當你草率決定把c++整個換成java的時候,你會發現最終那個龐大的程序慢得讓你不可接受。


一個這麼簡單的程序出現這麼大的時間差別很明顯是rand函數的實現不同。你得先消除這個影響


一點python的奇技淫巧(numexpr, numba):

i7 4790k, Python3.5, MKL

import numpy as np
import numexpr as ne
from numba import jit
import time

simulate_total = 2500000

def fun1():
    """Pure-numpy vectorized estimate (point + mirror, hence *2)."""
    x, y = np.random.rand(simulate_total), np.random.rand(simulate_total)
    inside_count = np.sum((x**2 + y**2) < 1) + np.sum(((1 - x)**2 + (1 - y)**2) < 1)
    return inside_count * 2 / simulate_total

def fun2():
    """numexpr version: fuses the element-wise expressions into one pass."""
    x, y = np.random.rand(simulate_total), np.random.rand(simulate_total)
    inside_count = (ne.evaluate("sum(where(x**2 + y**2 < 1, 1, 0))")
                    + ne.evaluate("sum(where((1-x)**2 + (1-y)**2 < 1, 1, 0))"))
    return inside_count * 2 / simulate_total

# numba JIT-compiles the plain Python loop (the '&' ops were eaten by the scrape)
@jit(nopython=True)
def fun3(l, seed):
    g_seed = seed
    cnt = 0
    overflow = 0
    for i in range(l):
        g_seed = (214013 * g_seed + 2531011)
        x = (g_seed >> 16) & 0x7FFF
        g_seed = (214013 * g_seed + 2531011)
        y = (g_seed >> 16) & 0x7FFF
        if x * x + y * y < 0x7FFF * 0x7FFF:
            cnt += 1
        x = 0x7FFF - x
        y = 0x7FFF - y
        if x * x + y * y < 0x7FFF * 0x7FFF:
            cnt += 1
    return cnt * 2 / simulate_total

%timeit fun1()
10 loops, best of 3: 116 ms per loop

%timeit fun2()
10 loops, best of 3: 80.4 ms per loop

%timeit a.fun3(a.simulate_total, 123)
100 loops, best of 3: 6.16 ms per loop

jit本身要花十幾毫秒


拿樓主的代碼在我的macbook air上測了下。

樓主的C++的大約60ms左右,python的大約6秒。

milo_float版本也是60ms左右(並沒有什麼提升,估計是mac下的g++的rand()已經夠快了),milo_sse2恕我不太會C++,編譯不過。

@Muy Guapo 的python免loop版420ms左右。

隨手寫了一個js的,node上一跑,70ms。Firefox上跑,55ms。

然後花了半天時間,擼了一個asm.js with SIMD的版本,Firefox上跑,12ms。

嗯。

ASM+SIMD版本源碼:

// asm.js + SIMD.js Monte-Carlo pi (runs on Firefox builds that shipped SIMD.js).
const simulatePI_ASM_SIMD = function (stdlib, imports, buffer) {
    "use asm"

    const i4 = stdlib.SIMD.Int32x4
    const f4 = stdlib.SIMD.Float32x4
    const i4c = i4.check
    const i4load = i4.load
    const i4add = i4.add
    const i4mul = i4.mul
    const i4lt = i4.lessThan
    const i4lane = i4.extractLane
    const total = imports.total|0
    const zeros = i4(0, 0, 0, 0)
    // squared radius: 0x8000^2 = 0x40000000 (coords are 15-bit)
    const r = i4(0x40000000, 0x40000000, 0x40000000, 0x40000000)

    var seed = 42
    // One LCG step yields 4 correlated 15-bit lanes (value + bitwise mirror).
    function random_i4() {
        var u0 = 0, u1 = 0
        seed = ((((seed|0) * 69069)|0) + 1)|0
        u0 = seed & 0x7fff            // '&' restored (eaten by the scrape)
        u1 = (seed >>> 16) & 0x7fff
        return i4(u0, ~u0, u1, ~u1)
    }

    function simulatePI() {
        var counts = zeros, count = 0
        var a = zeros, b = zeros, c = zeros
        var i = 0
        var n = 0

        for (i = 0, n = total >> 2; (i|0) < (n|0); i = (i + 1)|0) {
            a = i4c(random_i4())
            a = i4mul(a, a)
            b = i4c(random_i4())
            b = i4mul(b, b)
            c = i4add(a, b)
            c = i4lt(c, r)              // -1 per lane inside the circle
            counts = i4add(counts, c)
        }
        // lanes accumulated -1 per hit, so negate while summing
        count = (0
            - i4lane(counts, 0)
            - i4lane(counts, 1)
            - i4lane(counts, 2)
            - i4lane(counts, 3)
        )|0
        return +(count|0) / +(total|0) * 4.0
    }
    return simulatePI
}(this, {total: 50000000})

setTimeout(function () {
    console.time("simulate pi - asm+simd")
    const pi_asm_simd = simulatePI_ASM_SIMD()
    console.timeEnd("simulate pi - asm+simd")
    console.log(pi_asm_simd)
}, 1)


影響一個編譯型語言速度的原因只取決於語言實現和編譯器,而與語言具體是什麼無關。VB/VC都是編譯型的語言(VBA不是),所以理論上二者效率可以做到無差別。

樓上都在說C++如何可以更快,那我也來說說怎麼加速VB吧,在我看來,優化程度差不多的情況下VC和VB的效率差不會太明顯,甚至在某些情況下VB效率比VC要高

當然了,我用的是VB6來編譯release版本,VC的話,我用VC6,這樣比較才公平。

需要特別提示的是:VC6里rand函數並無效率的問題

結論先放到前頭,單線程情況下,VB6可能比VC6更快。

先看VC反彙編代碼,以下是循環部分的反彙編代碼,編譯環境為VC6、release其它編譯選項默認

.text:004010B3 loc_4010B3: ; CODE XREF: _main+A1 j
.text:004010B3 call _rand
.text:004010B8 mov esi, eax
.text:004010BA call _rand
.text:004010BF mov edi, eax
.text:004010C1 mov ecx, esi
.text:004010C3 imul eax, edi
.text:004010C6 imul ecx, esi
.text:004010C9 add eax, ecx
.text:004010CB cmp eax, 3FFF0001h
.text:004010D0 jnb short loc_4010D3
.text:004010D2 inc ebx
.text:004010D3
.text:004010D3 loc_4010D3: ; CODE XREF: _main+50 j
.text:004010D3 mov dword ptr [esp+2Ch+var_18], esi
.text:004010D7 mov dword ptr [esp+2Ch+var_18+4], ebp
.text:004010DB fild [esp+2Ch+var_18]
.text:004010DF fsubr ds:dbl_411110
.text:004010E5 call __ftol
.text:004010EA mov dword ptr [esp+2Ch+var_10], edi
.text:004010EE mov dword ptr [esp+2Ch+var_10+4], ebp
.text:004010F2 fild [esp+2Ch+var_10]
.text:004010F6 mov esi, eax
.text:004010F8 fsubr ds:dbl_411110
.text:004010FE call __ftol
.text:00401103 mov edx, eax
.text:00401105 imul edx, eax
.text:00401108 mov eax, esi
.text:0040110A imul eax, esi
.text:0040110D add edx, eax
.text:0040110F cmp edx, 3FFF0001h
.text:00401115 jnb short loc_401118
.text:00401117 inc ebx
.text:00401118
.text:00401118 loc_401118: ; CODE XREF: _main+95 j
.text:00401118 mov eax, [esp+2Ch+var_1C]
.text:0040111C dec eax
.text:0040111D mov [esp+2Ch+var_1C], eax
.text:00401121 jnz short loc_4010B3

再看VB反彙編的結果:

.text:00401853 loc_401853: ; CODE XREF: sub_4017E0+11A j
.text:00401853 fcomp [ebp+var_78]
.text:00401856 fnstsw ax
.text:00401858 test ah, 41h
.text:0040185B jz loc_4018FF
.text:00401861 lea eax, [ebp+var_58]
.text:00401864 mov [ebp+var_50], ebx
.text:00401867 push eax
.text:00401868 mov [ebp+var_58], edi
.text:0040186B call ds:rtcRandomNext
.text:00401871 fstp [ebp+var_30]
.text:00401874 lea ecx, [ebp+var_58]
.text:00401877 call esi ; __vbaFreeVar
.text:00401879 lea ecx, [ebp+var_58]
.text:0040187C mov [ebp+var_50], ebx
.text:0040187F push ecx
.text:00401880 mov [ebp+var_58], edi
.text:00401883 call ds:rtcRandomNext
.text:00401889 fstp [ebp+var_38]
.text:0040188C lea ecx, [ebp+var_58]
.text:0040188F call esi ; __vbaFreeVar
.text:00401891 fld ds:dbl_4010A0
.text:00401897 fsub [ebp+var_30]
.text:0040189A fld ds:dbl_4010A0
.text:004018A0 fsub [ebp+var_38]
.text:004018A3 fld [ebp+var_38]
.text:004018A6 fmul [ebp+var_38]
.text:004018A9 fld [ebp+var_30]
.text:004018AC fmul [ebp+var_30]
.text:004018AF faddp st(1), st
.text:004018B1 fcomp ds:dbl_4010A0
.text:004018B7 fnstsw ax
.text:004018B9 test ah, 1
.text:004018BC jz short loc_4018CA
.text:004018BE fld [ebp+var_20]
.text:004018C1 fadd ds:dbl_4010A0
.text:004018C7 fstp [ebp+var_20]
.text:004018CA
.text:004018CA loc_4018CA: ; CODE XREF: sub_4017E0+DC j
.text:004018CA fld st(1)
.text:004018CC fmul st, st(2)
.text:004018CE fld st(1)
.text:004018D0 fmul st, st(2)
.text:004018D2 faddp st(1), st
.text:004018D4 fcomp ds:dbl_4010A0
.text:004018DA fnstsw ax
.text:004018DC fstp st
.text:004018DE test ah, 1
.text:004018E1 fstp st
.text:004018E3 jz short loc_4018F1
.text:004018E5 fld [ebp+var_20]
.text:004018E8 fadd ds:dbl_4010A0
.text:004018EE fstp [ebp+var_20]
.text:004018F1
.text:004018F1 loc_4018F1: ; CODE XREF: sub_4017E0+103 j
.text:004018F1 fld [ebp+var_70]
.text:004018F4 fadd [ebp+var_18]
.text:004018F7 fst [ebp+var_18]
.text:004018FA jmp loc_401853

首先,VC在我這裡運行結果是70~80ms,VB在我這裡運行結果是300多ms,如果直接看代碼的話,VC並沒有用太多的硬浮點操作,反而是VB用了大量的硬浮點

那麼VB慢的唯一原因就是循環里的那幾個函數調用了:

rtcRandomNext

__vbaFreeVar

前面有人回答說VC的rand效率慢,但是VB的rand效率更慢

大多數VB的runtime API里,返回值的釋放都是需要調用者處理的,所以VB的任何runtime的API調用,都有可能觸發調用__vbaFreeVar,這個東西慢的很離譜。所以同樣的代碼VB比VC要慢的多。

知道為什麼這樣以後,把VB里的隨機函數也替換掉就可以了:

Private Randseed As Long

' Fast LCG replacement for VB's Rnd(): avoids the rtcRandomNext /
' __vbaFreeVar calls in the runtime. Note: overflows a Long unless the
' binary is compiled with integer overflow checks removed (see below).
Private Function rand2() As Double
    Randseed = Randseed * 1103515245 + 12345
    Randseed = Randseed And &H7FFFFFFF   ' the '&' of the hex literals was eaten by the scrape
    rand2 = Randseed / &H7FFFFFFF
End Function

需要注意的是,VB6里這個代碼會溢出的,所以編譯的時候還要做點修改,包括把VB里所有邊界檢查都去掉:

然後生成exe,直接運行:

對比一下:

最後貼一下VB的反彙編:

.text:004018EE loc_4018EE: ; CODE XREF: sub_401880+F3 j
.text:004018EE fst [ebp+var_18]
.text:004018F1 fcomp [ebp+var_5C]
.text:004018F4 fnstsw ax
.text:004018F6 test ah, 41h
.text:004018F9 jz short loc_401978
.text:004018FB call sub_4019E0
.text:00401900 fstp [ebp+var_30]
.text:00401903 call sub_4019E0
.text:00401908 fld ds:dbl_4010B0
.text:0040190E fsub [ebp+var_30]
.text:00401911 fstp [ebp+var_48]
.text:00401914 fld ds:dbl_4010B0
.text:0040191A fsub st, st(1)
.text:0040191C fstp [ebp+var_40]
.text:0040191F fld st
.text:00401921 fmul st, st(1)
.text:00401923 fld [ebp+var_30]
.text:00401926 fmul [ebp+var_30]
.text:00401929 faddp st(1), st
.text:0040192B fcomp ds:dbl_4010B0
.text:00401931 fnstsw ax
.text:00401933 test ah, 1
.text:00401936 fstp st
.text:00401938 jz short loc_401946
.text:0040193A fld [ebp+var_20]
.text:0040193D fadd ds:dbl_4010B0
.text:00401943 fstp [ebp+var_20]
.text:00401946
.text:00401946 loc_401946: ; CODE XREF: sub_401880+B8 j
.text:00401946 fld [ebp+var_48]
.text:00401949 fmul [ebp+var_48]
.text:0040194C fld [ebp+var_40]
.text:0040194F fmul [ebp+var_40]
.text:00401952 faddp st(1), st
.text:00401954 fcomp ds:dbl_4010B0
.text:0040195A fnstsw ax
.text:0040195C test ah, 1
.text:0040195F jz short loc_40196D
.text:00401961 fld [ebp+var_20]
.text:00401964 fadd ds:dbl_4010B0
.text:0040196A fstp [ebp+var_20]
.text:0040196D
.text:0040196D loc_40196D: ; CODE XREF: sub_401880+DF j
.text:0040196D fld [ebp+var_54]
.text:00401970 fadd [ebp+var_18]
.text:00401973 jmp loc_4018EE

所以VB也不是很慢的,看你怎麼用VB。

在我的環境里,一個優化過的VB比使用默認優化的VC要快很多,我相信再努力優化點,還可以更快。

那些用SSE、AVX甚至OpenMP的人就算了,太耍賴了。

利益相關,VB6十幾年的用戶。

最後補充幾句:

1、VBA是VB6的簡化版本,但效率略低,VB6編譯成的EXE效率是不錯的。

2、VB6誕生於1998年,以當時的技術水平衡量,VB6是一個相當不錯的開發工具。

3、VB6便於快速開發,雖然語法設計上有很多問題,但我仍然認為VB6的很多東西是無法超越的。

4、VB6與後來的VB.net不是同一種語言。

5、影響一個編譯型語言速度的原因只取決於語言實現和編譯器,而與語言具體是什麼無關

所以,不要小看VB。


這個問題告訴我們,Amdahl定律很重要。


chrome瀏覽器,筆記本。演示:Js Html Css 在線編輯器/Js代碼格式化

console.clear();
// Without the random function; with this odd pi algorithm that cannot
// produce the correct value of pi, but isolates Math.random's cost.
main();
main(true); // with Math.random
function main(random) {
    var ts = log_time();
    var rand_max = 32767;
    var simulate_total = 2500000;
    var inside_count = 0;
    var radius = rand_max * rand_max;
    var randA;
    var randB;
    var randA_opp;
    var randB_opp;
    for (var i = 1; i < simulate_total; i++) {
        randA = random ? rand_max * Math.random() : 0;
        randB = random ? rand_max * Math.random() : 0;
        if ((randA * randA + randB * randB) < radius) {
            inside_count++;
        }
        randA_opp = rand_max - randA;
        randB_opp = rand_max - randB;
        if ((randA_opp * randA_opp + randB_opp * randB_opp) < radius) {
            inside_count++;
        }
    }
    log_time(ts);
    console.log("pi = " + (inside_count / simulate_total * 2));
}
// Returns current ms timestamp; if given a previous timestamp, logs the delta.
function log_time(last) {
    var t = new Date().getTime();
    if (last) {
        console.log("use time: " + (t - last) + " ms");
    }
    return t;
}

輸出:

use time: 30 ms//沒有調用 Math.random函數,JavaScript弄個高效的真正的隨機函數比較難,受制於語言本身。

pi = 1.9999992

use time: 586 ms

pi = 3.1419928

瓶頸也是在隨機函數,佔了95%的時間


由於問題中測試環境不明,我猜測測試的是VB.NET和CPython。

VB.NET僅僅是前端語法(C#, F#, J#之類的也是),都會被編譯成CIL位元組碼. 執行時JIT Compiler會將位元組碼再次翻譯為機器碼. 最終執行的都是機器碼,所以一點都不會慢的。之所以運行時間還有顯著差距,是因為函數調用和類型轉換的代價不同。託管對象的開銷肯定會大一些。

CPython沒JIT Compiler,速度慢些是肯定的。

如果想更快,可以考慮空間換時間,循環內的類型轉換和隨機數生成破壞了程序的局部性。修改掉這兩點會有顯著提升。

藉助新的指令集也會有收益,不過通用軟體一般不會用尚未普及的指令集。


我就奇了怪了,憑什麼覺得VBA會比C++慢很多?只用內置庫做某件事VBA比C++快我都不奇怪


如果OS創建進程的耗時是100ms(實際並沒有這麼大)

那Cpp就比VBA快10倍了。

這不是耍流氓么?


如果你用VB打包一個DLL讓你的VBA調用,你的VBA就一句話

Sub main()
    ' Delegates the whole simulation to RandPi in a compiled VB DLL,
    ' so the VBA side carries none of the loop cost.
    Call RandPi
End Sub

然後你就發現,兩者幾乎一樣快了。


我覺得吧,樓下答案都跑題了。樓主想問的是C++為什麼只比VBA快了4倍,而不是10倍,樓下的一直為C++洗白,一直去優化C++代碼。而忽略了問題的本質。VBA為什麼就要慢。雖然我也不知道為什麼會慢。

還有為什麼要和VBA比較呢,他們是兩種完全不同類型的語言。怎麼樣也可以和VB.net比一比或許還有應用上重疊的空間。


4倍不得了了好嗎?


歪個樓,快4倍這個說法存在語病。

因為嚴格來說這裡沒有「速度」的概念,你可以說博爾特跑步比我快4倍,因為這裡是指速度。但此處明顯是指時間長短,所以不能用倍數表述。


是的,你的代碼沒優化好


推薦閱讀:

正在學c++但是越學越覺得自己還有好多東西不知道?
為什麼現代CFD和PIC模擬大量採用C++編寫?針對這些模擬C++相對於C的優勢在哪?
C、C++、MATLAB、Python、Go 哪個比較適合寫演算法?
C++如何調用matlab庫函數?
linux下子進程退出狀態為什麼永遠是非正常?

TAG:編程 | 優化 | C | 隨機數 |