我最初用C語言編寫了一個簡單的音頻解壓縮程序。但後來我在幾種不同的音頻容器格式中遇到了相同的音頻編碼,於是決定擴展這個解壓縮器,把它做成一個更通用的「轉換器」。因爲我對Python更加熟悉,而且從長遠來看這對我來說可能會更容易,所以我改用了Python。我在測試Python版本時注意到的第一件事是,它明顯比C版本慢。與等價的C版本相比,解壓縮程序爲什麼會這麼慢?
C版本是這樣的:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* Layout of one compressed block as consumed by decompress_adpcm():
   2 header bytes followed by 14 data bytes holding two 4-bit samples each. */
const uint8_t BYTES_PER_BLOCK = 16;
const uint8_t SAMPLES_PER_BLOCK = 28;

/* Bit masks for the flag byte (second byte of every block). Written in hex
   because the 0b prefix is a compiler extension before C23.
   FLAG_END is not referenced by the decoder in this file. */
const uint8_t FLAG_END = 0x01;
const uint8_t FLAG_LOOP_CONTEXT = 0x02;
const uint8_t FLAG_LOOP_START = 0x04;

/* Predictor coefficient pairs, selected by the high nibble of a block's first
   byte: [n][0] weights the previous sample, [n][1] the sample before that. */
const double coeffs[5][2] = {
    { 0.0, 0.0 },
    { 60.0/64.0, 0.0 },
    { 115.0/64.0, -52.0/64.0 },
    { 98.0/64.0, -55.0/64.0 },
    { 122.0/64.0, -60.0/64.0 }
};
/*
 * Return the size in bytes of the stream `f`, leaving the read/write position
 * where it was on entry.
 *
 * Returns 0 when `f` is NULL, when the stream cannot be seeked/queried, or
 * when the size does not fit in 32 bits (instead of silently storing ftell's
 * -1 error value or a truncated size in the unsigned result).
 */
uint32_t filesize(FILE *f)
{
    long saved_pos;
    long end_pos;

    if (f == NULL)
        return 0;

    saved_pos = ftell(f);
    if (saved_pos < 0)
        return 0;

    if (fseek(f, 0, SEEK_END) != 0)
        return 0;
    end_pos = ftell(f);

    /* Always try to restore the caller's position, even if ftell failed. */
    if (fseek(f, saved_pos, SEEK_SET) != 0)
        return 0;

    if (end_pos < 0 || (unsigned long)end_pos > 0xFFFFFFFFUL)
        return 0;
    return (uint32_t)end_pos;
}
/* Saturate `val` to the signed 16-bit range [-32768, 32767]. */
int clamp_s16(int32_t val)
{
    const int32_t lowest = -32768;
    const int32_t highest = 32767;

    return (val > highest) ? highest
         : (val < lowest)  ? lowest
         : val;
}
/*
 * Decode `blocks_to_do` ADPCM blocks from `cmpbuf` and append the resulting
 * 16-bit samples to `outfile` (written with fwrite, so in host byte order).
 *
 * Block layout as read here: byte 0 holds the predictor index (high nibble)
 * and the shift factor (low nibble), byte 1 is the flag byte, and the
 * remaining bytes hold two 4-bit samples each, low nibble first.
 *
 * hist1 / hist2 are the two most recent decoded samples fed to the predictor.
 * Blocks whose flag byte is >= 0x07 produce silence.
 *
 * When `loops` > 0 and a block with both FLAG_LOOP_START and
 * FLAG_LOOP_CONTEXT set was seen, the range from the last such block to the
 * end of the buffer is decoded `loops` more times; every repetition starts
 * from the predictor history left at the end of the first pass.
 */
void decompress_adpcm(uint8_t *cmpbuf, FILE *outfile, uint32_t blocks_to_do, int32_t hist1, int32_t hist2, int loops)
{
    const int num_filters = (int)(sizeof coeffs / sizeof coeffs[0]);
    uint32_t block_num;   /* unsigned to match blocks_to_do */
    int sample_num;
    int32_t loop_start = -1;
    int l;

    for (block_num = 0; block_num < blocks_to_do; block_num++)
    {
        const uint8_t *block = cmpbuf + (size_t)block_num * BYTES_PER_BLOCK;
        int predict_nr = block[0] >> 4;
        int shift_factor = block[0] & 0x0F;
        uint8_t flag = block[1];

        /* The header nibble can encode 0..15 but the table only has
           num_filters rows; fall back to filter 0 (no prediction) rather
           than reading past the end of coeffs on malformed input. */
        if (predict_nr >= num_filters)
            predict_nr = 0;

        if ((flag & FLAG_LOOP_START) && (flag & FLAG_LOOP_CONTEXT))
            loop_start = (int32_t)block_num;

        for (sample_num = 0; sample_num < SAMPLES_PER_BLOCK; sample_num++)
        {
            int sample = 0;
            int16_t out;

            if (flag < 0x07)
            {
                int data = block[2 + (sample_num / 2)];
                int nibble = (sample_num & 1) ? (data >> 4) : (data & 0x0F);
                int scale = nibble << 12;

                /* Sign-extend the 16-bit value explicitly instead of relying
                   on an implementation-defined narrowing to short. */
                if (scale & 0x8000)
                    scale -= 0x10000;

                sample = (int)((scale >> shift_factor) + (hist1 * coeffs[predict_nr][0]) + (hist2 * coeffs[predict_nr][1]));
            }

            out = (int16_t)clamp_s16(sample);
            fwrite(&out, sizeof out, 1, outfile);

            /* The predictor history keeps the unclamped sample. */
            hist2 = hist1;
            hist1 = sample;
        }
    }

    if (loops > 0 && loop_start >= 0)
    {
        for (l = 0; l < loops; l++)
        {
            decompress_adpcm(cmpbuf + (size_t)loop_start * BYTES_PER_BLOCK, outfile, blocks_to_do - (uint32_t)loop_start, hist1, hist2, 0);
        }
    }
}
/*
 * Read the whole compressed input file into memory, decode it (repeating the
 * looped section three times) and write the raw samples to the output file.
 *
 * Returns 0 on success and EXIT_FAILURE if a file cannot be opened, read or
 * written, or if the input buffer cannot be allocated.
 */
int main(void)
{
    const char *in_path = "C:\\test.adpcm";
    const char *out_path = "C:\\test_c.raw";
    FILE *cmpfile;
    FILE *outfile;
    uint32_t cmpsize;
    uint8_t *cmpbuf;

    cmpfile = fopen(in_path, "rb");
    if (cmpfile == NULL)
    {
        perror(in_path);
        return EXIT_FAILURE;
    }

    cmpsize = filesize(cmpfile);
    /* Request at least one byte so a NULL result always means failure. */
    cmpbuf = calloc(1, cmpsize ? cmpsize : 1);
    if (cmpbuf == NULL)
    {
        perror("calloc");
        fclose(cmpfile);
        return EXIT_FAILURE;
    }

    if (cmpsize > 0 && fread(cmpbuf, cmpsize, 1, cmpfile) != 1)
    {
        perror(in_path);
        free(cmpbuf);
        fclose(cmpfile);
        return EXIT_FAILURE;
    }
    fclose(cmpfile);

    outfile = fopen(out_path, "wb");
    if (outfile == NULL)
    {
        perror(out_path);
        free(cmpbuf);
        return EXIT_FAILURE;
    }

    decompress_adpcm(cmpbuf, outfile, cmpsize / 16, 0, 0, 3);
    free(cmpbuf);

    /* fclose flushes buffered samples; a failure here means lost output. */
    if (fclose(outfile) != 0)
    {
        perror(out_path);
        return EXIT_FAILURE;
    }
    return 0;
}
Python的版本是這樣的:
import struct
# Layout of one compressed block: 2 header bytes followed by 14 data bytes,
# each data byte carrying two 4-bit samples.
BYTES_PER_BLOCK = 16
SAMPLES_PER_BLOCK = 28

# Bit masks for the flag byte (second byte of every block).
FLAG_END = 0x01
FLAG_LOOP_CONTEXT = 0x02
FLAG_LOOP_START = 0x04

# Predictor coefficient pairs keyed by predictor index; inner key 0 weights
# the previous sample and inner key 1 the sample before that.
coeffs = {
    index: {0: first / 64.0, 1: second / 64.0}
    for index, (first, second) in enumerate(
        (
            (0.0, 0.0),
            (60.0, 0.0),
            (115.0, -52.0),
            (98.0, -55.0),
            (122.0, -60.0),
        )
    )
}

# Pre-compiled packer for one little-endian signed 16-bit sample.
s16_t = struct.Struct("<h")
def s32(n):
    """Wrap *n* into the signed 32-bit range and return it as an int.

    Float inputs are wrapped first and then truncated toward zero by int().
    """
    half_range = 0x80000000
    wrapped = (n + half_range) % (half_range * 2)
    return int(wrapped - half_range)
def s16(n):
    """Wrap *n* into the signed 16-bit range and return it as an int."""
    half_range = 0x8000
    wrapped = (n + half_range) % (half_range * 2)
    return int(wrapped - half_range)
def put_s16_le(n):
    """Return *n* packed as two bytes using the module-level ``s16_t`` struct."""
    packed = s16_t.pack(n)
    return packed
def clamp_s16(n):
    """Saturate *n* to the signed 16-bit range [-32768, 32767]."""
    lowest, highest = -32768, 32767
    return highest if n > highest else (lowest if n < lowest else n)
def decompress_adpcm(cmpbuf, outfile, blocks_to_do, hist1=0, hist2=0, loops=0):
    """Decode ADPCM blocks from *cmpbuf* and write 16-bit samples to *outfile*.

    Each block is BYTES_PER_BLOCK bytes: byte 0 holds the predictor index
    (high nibble) and shift factor (low nibble), byte 1 is the flag byte, and
    the remaining bytes carry two 4-bit samples each, low nibble first.
    Blocks whose flag byte is >= 0x07 produce silence.

    Args:
        cmpbuf: Indexable buffer of ints (e.g. ``bytes``) with the blocks.
        outfile: Binary file-like object; receives little-endian samples.
        blocks_to_do: Number of blocks to decode from the start of *cmpbuf*.
        hist1: Most recent decoded sample fed to the predictor.
        hist2: Decoded sample before *hist1*.
        loops: How many extra times to decode the looped section, i.e. the
            range from the last block with both FLAG_LOOP_START and
            FLAG_LOOP_CONTEXT set up to the end. Every repetition starts from
            the predictor history left at the end of the first pass.

    Raises:
        KeyError: If a decodable block selects a predictor index that is not
            in ``coeffs``.
    """
    loop_start = -1
    loop_bits = FLAG_LOOP_START | FLAG_LOOP_CONTEXT
    for block_num in range(blocks_to_do):
        base = block_num * BYTES_PER_BLOCK
        header = cmpbuf[base]
        flag = cmpbuf[base + 1]
        if (flag & loop_bits) == loop_bits:
            loop_start = block_num

        # Per-block values are looked up once instead of once per sample.
        decodable = flag < 0x07
        if decodable:
            shift_factor = header & 0x0F
            block_coeffs = coeffs[header >> 4]
            coef1 = block_coeffs[0]
            coef2 = block_coeffs[1]
            data_start = base + 2

        packed = []
        for sample_num in range(SAMPLES_PER_BLOCK):
            sample = 0
            if decodable:
                adpcm_byte = cmpbuf[data_start + (sample_num // 2)]
                if sample_num & 1:
                    nibble = adpcm_byte >> 4
                else:
                    nibble = adpcm_byte & 0x0F
                scale = s16(nibble << 12)
                sample = s32((scale >> shift_factor) + (hist1 * coef1) + (hist2 * coef2))
            packed.append(put_s16_le(clamp_s16(sample)))
            # The predictor history keeps the unclamped sample.
            hist2 = hist1
            hist1 = sample
        # One write per block instead of one per sample.
        outfile.write(b"".join(packed))

    if loops > 0 and loop_start >= 0:
        # loop_start is a block index: convert it to a byte offset, and pass
        # the remaining block count explicitly so hist1/hist2 land in the
        # right parameters.
        loop_offset = loop_start * BYTES_PER_BLOCK
        loop_blocks = blocks_to_do - loop_start
        loop_buf = cmpbuf[loop_offset:loop_offset + (loop_blocks * BYTES_PER_BLOCK)]
        for _ in range(loops):
            decompress_adpcm(loop_buf, outfile, loop_blocks, hist1, hist2)
def main():
    """Decode the hard-coded input file into the hard-coded raw output file.

    The whole input is read into memory, then decoded with the looped section
    repeated three times. Returns 0.
    """
    with open(r"C:\test.adpcm", "rb") as cmpf:
        cmpbuf = cmpf.read()
    # The handle must be bound to the same name that is passed to the decoder;
    # binding it as `out` while passing `outf` raised NameError.
    with open(r"C:\test_py.raw", "wb") as outf:
        decompress_adpcm(cmpbuf, outf, len(cmpbuf) // BYTES_PER_BLOCK, loops=3)
    return 0
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
這是我運行一次profile得到的結果:
1647764 function calls (1647761 primitive calls) in 8.219 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 8.219 8.219 :0(exec)
8 0.000 0.000 0.000 0.000 :0(len)
2 0.000 0.000 0.000 0.000 :0(open)
274624 0.344 0.000 0.344 0.000 :0(pack)
1 0.000 0.000 0.000 0.000 :0(read)
1 0.000 0.000 0.000 0.000 :0(setprofile)
274624 1.234 0.000 1.234 0.000 :0(write)
1 0.000 0.000 8.219 8.219 <string>:1(<module>)
274624 0.625 0.000 0.625 0.000 test.py:105(s32)
274624 0.734 0.000 0.734 0.000 test.py:108(s16)
274624 0.875 0.000 1.219 0.000 test.py:111(put_s16_le)
274624 0.266 0.000 0.266 0.000 test.py:114(clamp_s16)
4/1 4.141 1.035 8.219 8.219 test.py:123(decompress_adpcm)
1 0.000 0.000 8.219 8.219 test.py:178(main)
1 0.000 0.000 8.219 8.219 profile:0(main())
0 0.000 0.000 profile:0(profiler)
在我的機器(Intel Core 2 Duo E8200 @ 2.67GHz)上,每次測試運行時,C版本不到一秒就執行完畢,而Python版本大約需要8秒鐘(如上所示)才能完成。我使用相同的音頻文件測試了兩個版本,也沒有發現任何佔用大量資源的程序或後臺任務可能會以某種方式影響Python的性能。
現在,我看到人們總是說「如果你想要速度,就用C」之類的話,我當然同意,但是即使在最好的情況下,Python也不應該比C慢這麼多吧!我一直在盡力優化它,但沒有看到任何重大改進。我做的最後一個調整是爲put_s16_le
添加了一個靜態的struct.Struct對象,這有一些幫助,但效果仍然不大。
那麼,有沒有什麼辦法可以優化Python版本,還是說我只能忍受這個緩慢的腳本?
如果它很重要,我使用Python 3.4.3。
Python版本比C版本慢得多是完全正常的。我甚至對Python只慢了8倍感到驚訝。 –