我似乎已經確定我可以信任的工具的問題...我可以信任哪一種工具?
我一直在測試的工具是Librosa和Kaldi在的audio file的40個濾波器能 地塊的可視化數據集的創建。
使用卡爾迪中的這些配置來提取濾波器組能量。
fbank.conf
--htk-compat=false
--window-type=hamming
--sample-frequency=16000
--num-mel-bins=40
--use-log-fbank=true
提取的數據是使用librosa
情節作圖。 Librosa
利用matplotlib
pcolormesh
,這意味着不應該有任何區別,除了librosa
提供了一個更容易使用的API。
print static.shape
print type(static)
print np.min(static)
print np.max(static)
fig = plt.figure()
librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet)
#plt.axis('off')
plt.title("log mel power spectrum of " + name)
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
plt.savefig(plot+"/"+name+"_plot_static_conv.png")
plt.show()
輸出:
(474, 40)
<type 'numpy.ndarray'>
-1.828067
22.70058
Got bus address: "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf"
Connected to accessibility bus at: "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf"
Registered DEC: true
Registered event listener change listener: true
類似的情節在Librosa創建爲這樣:
audio_path="../../../../Dropbox/SI1392.wav"
#audio_path = librosa.util.example_audio_file()
print "Example audio found"
y, sr = librosa.load(audio_path)
print "Example audio loaded"
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print "Example audio spectogram"
log_specto = librosa.core.logamplitude(specto)
print "min and max"
print np.min(log_specto)
print np.max(log_specto)
print "Example audio log specto"
plt.figure(figsize=(12,4))
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet)
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
print "See"
print specto.shape
print log_specto.shape
plt.show()
輸出該:
libraries loaded!
Example audio found
Example audio loaded
Example audio spectogram
min and max
-84.6796661558
-4.67966615584
Example audio log specto
See
(40, 657)
(40, 657)
儘管有顏色,但兩者都顯示類似的圖,但能量範圍似乎有點不同。
Kaldi有-1.828067/22.70058
最小/ MAX和Librosa具有最小/最大-84.6796661558/-4.67966615584
問題是我想保存這些地塊作爲numpy的陣列,用於進一步處理。
這似乎創造一個不同的情節.. 使用Librosa數據,我創建的情節一樣:
plt.figure()
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto)
convert = plt.get_cmap(cm.jet)
numpy_static = convert(min_max_scaled_log_specto)
plt.imshow(np.flipud(log_specto), aspect='auto')
plt.colorbar()
print "Sooo?"
plt.show()
這是完美的......它類似於原始數據集..
但隨着Kaldi我從這個代碼這個情節:
convert = plt.get_cmap(cm.jet)
numpy_output_static = convert(np.flipud(static.T))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
raw_input("sadas")
我從以前的帖子,對於紅色發生的歷史的原因可能是由於範圍和標準化前,將有助於發現 - 但這導致此:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
convert = plt.get_cmap(cm.jet)
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
但是,這絕不可能與Kaldi陰謀的原始陰謀有關...那麼,爲什麼它看起來像這樣?爲什麼我能夠用從Librosa提取的能量而不是從Kaldi提取的能量來繪製它?對於Librosa
最小工作示例:與kaldi
#
# Minimal example of Librosa plot example.
# Made for testing the plot, and test for accurat
# Conversion between the two parts.
#
import os
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import Normalize
import matplotlib
from PIL import Image
import librosa
import colormaps as cmaps
import librosa.display
import ast
from scipy.misc import toimage
from matplotlib import cm
from sklearn import preprocessing
print "libraries loaded!"
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
audio_path="../../../../Dropbox/SI1392.wav"
#audio_path = librosa.util.example_audio_file()
print "Example audio found"
y, sr = librosa.load(audio_path)
print "Example audio loaded"
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print "Example audio spectogram"
log_specto = librosa.core.logamplitude(specto)
print "min and max"
print np.min(log_specto)
print np.max(log_specto)
print "Example audio log specto"
plt.figure(figsize=(12,4))
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet)
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
print "See"
#plt.show()
print specto.shape
print log_specto.shape
plt.figure()
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto)
convert = plt.get_cmap(cm.jet)
numpy_static = convert(min_max_scaled_log_specto)
plt.imshow(np.flipud(log_specto), aspect='auto')
plt.colorbar()
print "Sooo?"
plt.show()
最小工作示例 - (真實數據):
#
# Extracted version:
#
#
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
import librosa
import librosa.display
from matplotlib import cm
from sklearn import preprocessing
import ast
import urllib
import os
import sys
from os import listdir
from os.path import isfile, join
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
def make_plot_store_data(name,interweaved,static,delta,delta_delta,isTrain,isTest,isDev):
print static.shape
print type(static)
print np.min(static)
print np.max(static)
fig = plt.figure()
librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet)
#plt.axis('off')
plt.title("log mel power spectrum of " + name)
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
#plt.show()
#plt.close()
#raw_input("asd")
if isTrain == True:
plt.figure()
convert = plt.get_cmap(cm.jet)
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
raw_input("sadas")
link = "https://gist.githubusercontent.com/Miail/51311b34f5e5333bbddf9cb17c737ea4/raw/786b72477190023e93b9dd0cbbb43284ab59921b/feature.txt"
f = urllib.urlopen(link)
temp_list = []
for line in f:
entries = 0
data_splitted = line.split()
if len(data_splitted) == 2:
file_name = data_splitted[0]
else:
entries = 1+entries
if data_splitted[-1] == ']':
temp_list.extend([ast.literal_eval(i) for i in data_splitted[:-1]])
else:
temp_list.extend([ast.literal_eval(i) for i in data_splitted])
dimension = 120
entries = len(temp_list)/dimension
data = np.array(temp_list)
interweaved = data.reshape(entries,dimension)
static =interweaved[:,:-80]
delta =interweaved[:,40:-40]
delta_delta =interweaved[:,80:]
plot_interweaved = data.reshape(entries*3,dimension/3)
print static.shape
print delta.shape
print delta_delta.shape
make_plot_store_data(file_name,plot_interweaved,static,delta,delta_delta,True,False,False)