
Keras accuracy does not change

I have a few thousand audio files and I want to classify them with Keras and Theano. So far I have generated a 28x28 spectrogram of each audio file (bigger would probably be better, but at this point I am just trying to get the algorithm working) and read the images into a matrix. In the end I have one big image matrix to feed into the network for image classification.

In a tutorial I found the following MNIST classification code:

import numpy as np 

from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense 
from keras.utils import np_utils 

batch_size = 128 
nb_classes = 10 
nb_epochs = 2 

(X_train, y_train), (X_test, y_test) = mnist.load_data() 

X_train = X_train.reshape(60000, 784) 
X_test = X_test.reshape(10000, 784) 
X_train = X_train.astype("float32") 
X_test = X_test.astype("float32") 
X_train /= 255 
X_test /= 255 

print(X_train.shape[0], "train samples") 
print(X_test.shape[0], "test samples") 

y_train = np_utils.to_categorical(y_train, nb_classes) 
y_test = np_utils.to_categorical(y_test, nb_classes) 

model = Sequential() 

model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = nb_classes, activation = "softmax")) 

model.compile(optimizer = "adam", loss = "categorical_crossentropy") 

model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epochs, show_accuracy = True, verbose = 2, validation_data = (X_test, y_test)) 
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 0) 
print("Test score: ", score[0]) 
print("Test accuracy: ", score[1]) 

When I run this code I get the expected results:

(60000L, 'train samples') 
(10000L, 'test samples') 
Train on 60000 samples, validate on 10000 samples 
Epoch 1/2 
2s - loss: 0.2988 - acc: 0.9131 - val_loss: 0.1314 - val_acc: 0.9607 
Epoch 2/2 
2s - loss: 0.1144 - acc: 0.9651 - val_loss: 0.0995 - val_acc: 0.9673 
('Test score: ', 0.099454972004890438) 
('Test accuracy: ', 0.96730000000000005) 

Up to this point everything works perfectly. However, when I apply the same approach to my own dataset, the accuracy gets stuck.

My code is as follows:

import os 

import pandas as pd 

from sklearn.cross_validation import train_test_split 

from keras.models import Sequential 
from keras.layers.convolutional import Convolution2D, MaxPooling2D 
from keras.layers.core import Dense, Activation, Dropout, Flatten 
from keras.utils import np_utils 

import AudioProcessing as ap 
import ImageTools as it 

batch_size = 128 
nb_classes = 2 
nb_epoch = 10 


for i in range(20): 
    print "\n" 
# Generate spectrograms if necessary 
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0): 
    print "Audio files are already processed. Skipping..." 
else: 
    print "Generating spectrograms for the audio files..." 
    ap.audio_2_image("./AudioNormalPathalogicClassification/Audio/","./AudioNormalPathalogicClassification/Image/",".wav",".png",(28,28)) 

# Read the result csv 
df = pd.read_csv('./AudioNormalPathalogicClassification/Result/result.csv', header = None) 

df.columns = ["RegionName","IsNormal"] 

bool_mapping = {True : 1, False : 0} 

nb_classes = 2 

for col in df: 
    if(col == "RegionName"): 
     a = 3  
    else: 
     df[col] = df[col].map(bool_mapping) 

y = df.iloc[:,1:].values 

y = np_utils.to_categorical(y, nb_classes) 

# Load images into memory 
print "Loading images into memory..." 
X = it.load_images("./AudioNormalPathalogicClassification/Image/",".png") 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

X_train = X_train.reshape(X_train.shape[0], 784) 
X_test = X_test.reshape(X_test.shape[0], 784) 
X_train = X_train.astype("float32") 
X_test = X_test.astype("float32") 
X_train /= 255 
X_test /= 255 

print("X_train shape: " + str(X_train.shape)) 
print(str(X_train.shape[0]) + " train samples") 
print(str(X_test.shape[0]) + " test samples") 

model = Sequential() 


model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = nb_classes, activation = "softmax")) 

model.compile(loss = "categorical_crossentropy", optimizer = "adam") 

print model.summary() 

model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1, validation_data = (X_test, y_test)) 
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 1) 
print("Test score: ", score[0]) 
print("Test accuracy: ", score[1]) 

AudioProcessing.py

import os 
import scipy as sp 
import scipy.io.wavfile as wav 
import matplotlib.pylab as pylab 
import Image 

def save_spectrogram_scipy(source_filename, destination_filename, size): 
    dt = 0.0005 
    NFFT = 1024  
    Fs = int(1.0/dt) 
    fs, audio = wav.read(source_filename) 
    if(len(audio.shape) >= 2): 
     audio = sp.mean(audio, axis = 1) 
    fig = pylab.figure()  
    ax = pylab.Axes(fig, [0,0,1,1])  
    ax.set_axis_off() 
    fig.add_axes(ax) 
    pylab.specgram(audio, NFFT = NFFT, Fs = Fs, noverlap = 900, cmap="gray") 
    pylab.savefig(destination_filename) 
    img = Image.open(destination_filename).convert("L") 
    img = img.resize(size) 
    img.save(destination_filename) 
    pylab.clf() 
    del img 

def audio_2_image(source_directory, destination_directory, audio_extension, image_extension, size): 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
     if file.endswith(audio_extension):   
      destinationName = file[:-4] 
      save_spectrogram_scipy(source_directory + file, destination_directory + destinationName + image_extension, size) 
      count += 1 
      print ("Generating spectrogram for files " + str(count) + "/" + str(nb_files) + ".") 

ImageTools.py

import os 
import numpy as np 
import matplotlib.image as mpimg 
def load_images(source_directory, image_extension): 
    image_matrix = [] 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
     if file.endswith(image_extension): 
      with open(source_directory + file,"r+b") as f: 
       img = mpimg.imread(f) 
       img = img.flatten()     
       image_matrix.append(img) 
       del img 
       count += 1 
       #print ("File " + str(count) + "/" + str(nb_files) + " loaded.") 
    return np.asarray(image_matrix) 

So when I run the code above, I get:

Audio files are already processed. Skipping... 
Loading images into memory... 
X_train shape: (2394L, 784L) 
2394 train samples 
1027 test samples 
-------------------------------------------------------------------------------- 
Initial input shape: (None, 784) 
-------------------------------------------------------------------------------- 
Layer (name)     Output Shape     Param # 
-------------------------------------------------------------------------------- 
Dense (dense)     (None, 100)     78500 
Dense (dense)     (None, 200)     20200 
Dense (dense)     (None, 200)     40200 
Dense (dense)     (None, 2)      402 
-------------------------------------------------------------------------------- 
Total params: 139302 
-------------------------------------------------------------------------------- 
None 
Train on 2394 samples, validate on 1027 samples 
Epoch 1/10 
2394/2394 [==============================] - 0s - loss: 0.6898 - acc: 0.5455 - val_loss: 0.6835 - val_acc: 0.5716 
Epoch 2/10 
2394/2394 [==============================] - 0s - loss: 0.6879 - acc: 0.5522 - val_loss: 0.6901 - val_acc: 0.5716 
Epoch 3/10 
2394/2394 [==============================] - 0s - loss: 0.6880 - acc: 0.5522 - val_loss: 0.6842 - val_acc: 0.5716 
Epoch 4/10 
2394/2394 [==============================] - 0s - loss: 0.6883 - acc: 0.5522 - val_loss: 0.6829 - val_acc: 0.5716 
Epoch 5/10 
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716 
Epoch 6/10 
2394/2394 [==============================] - 0s - loss: 0.6887 - acc: 0.5522 - val_loss: 0.6832 - val_acc: 0.5716 
Epoch 7/10 
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6859 - val_acc: 0.5716 
Epoch 8/10 
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716 
Epoch 9/10 
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716 
Epoch 10/10 
2394/2394 [==============================] - 0s - loss: 0.6877 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716 
1027/1027 [==============================] - 0s 
('Test score: ', 0.68490593621422047) 
('Test accuracy: ', 0.57156767283349563) 

I have tried changing the network and adding more epochs, but I always get the same result no matter what. I do not understand why I keep getting the same result.

Any help would be appreciated. Thank you.

EDIT: I found a bug where the pixel values were not being read correctly. I fixed ImageTools.py as below:

import os 
import numpy as np 
from scipy.misc import imread 

def load_images(source_directory, image_extension): 
    image_matrix = [] 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
     if file.endswith(image_extension): 
      with open(source_directory + file,"r+b") as f: 
       img = imread(f)     
       img = img.flatten()       
       image_matrix.append(img) 
       del img 
       count += 1 
       #print ("File " + str(count) + "/" + str(nb_files) + " loaded.") 
    return np.asarray(image_matrix) 

Now I actually get grayscale pixel values from 0 to 255, so dividing by 255 now makes sense. However, I still get the same result.
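For reference (this note is not part of the original post): the two readers decode PNGs differently. matplotlib.image.imread returns floats already scaled to [0, 1], while scipy.misc.imread returns the raw uint8 values in [0, 255] (it is deprecated in newer SciPy releases, where imageio.imread is the usual replacement). A quick check, using a hypothetical file name:

import matplotlib.image as mpimg
from scipy.misc import imread

path = "./AudioNormalPathalogicClassification/Image/example.png"  # hypothetical example file

a = mpimg.imread(path)  # PNG decoded as float32 in [0.0, 1.0]
b = imread(path)        # raw uint8 values in [0, 255]

print(a.dtype, a.min(), a.max())
print(b.dtype, b.min(), b.max())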

Answers

Answer (score 15)

The most likely reason is that the optimizer is not suited to your dataset. Here is the list of Keras optimizers from the documentation.

I suggest you first try SGD with its default parameter values. If it still does not work, divide the learning rate by 10. Repeat this a few times if necessary. If you reach a learning rate of 1e-6 and it still does not work, then you have another problem.

In summary, replace this line:

model.compile(loss = "categorical_crossentropy", optimizer = "adam") 

with this:

from keras.optimizers import SGD

opt = SGD(lr=0.01)
model.compile(loss = "categorical_crossentropy", optimizer = opt) 

Then change the learning rate a few times if it still does not work.

If this is the problem, you should see the loss start to decrease after a few epochs.
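Spelled out, the suggested sweep could look roughly like the sketch below. It assumes the Keras 1.x API and the X_train/X_test split used elsewhere in this thread; build_model() is just a helper that rebuilds the dense network from the question:

from keras.optimizers import SGD

def build_model():
    # Rebuild the 784-100-200-200-2 dense network from the question.
    from keras.models import Sequential
    from keras.layers.core import Dense
    model = Sequential()
    model.add(Dense(output_dim = 100, input_dim = 784, activation = "relu"))
    model.add(Dense(output_dim = 200, activation = "relu"))
    model.add(Dense(output_dim = 200, activation = "relu"))
    model.add(Dense(output_dim = 2, activation = "softmax"))
    return model

# Start at the default learning rate and divide by 10 each round.
for lr in [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]:
    model = build_model()
    model.compile(loss = "categorical_crossentropy", optimizer = SGD(lr = lr))
    print("Learning rate:", lr)
    model.fit(X_train, y_train, batch_size = 128, nb_epoch = 10,
              validation_data = (X_test, y_test), verbose = 2)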


When I tried 10^-5 the accuracy became 0.53, and at 10^-6 it became 0.43. For all the others it stayed at 0.57. I also tried the other optimizers in the link, but the results were the same. –


Another thing you can try is changing how you normalize your data. Try scikit-learn's StandardScaler (a sketch follows these comments). If that still does not work, you will need a more complex model. – TheWalkingCube


Yes, but it is not an RNN, just a few fully connected layers. – TheWalkingCube
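A minimal sketch of the normalization suggested in the comments above, using scikit-learn's StandardScaler in place of the division by 255; the scaler is fitted on the training images only and its statistics are reused for the test images:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn per-pixel mean and std on the training set
X_test = scaler.transform(X_test)        # apply the same transform to the test set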

Answer (score 3, the asker's own follow-up)

After some checking, I found that the problem was the data itself. It was very dirty: the same input had two different outputs, which caused confusion. After cleaning the data, my accuracy now reaches 69%. Still not good enough, but at least I can work forward from here now that the data is clean.

I tested with the following code:

import os 
import sys 

import pandas as pd 
import numpy as np 

from keras.models import Sequential 
from keras.layers.convolutional import Convolution2D, MaxPooling2D 
from keras.layers.core import Dense, Activation, Dropout, Flatten 
from keras.utils import np_utils 

sys.path.append("./") 
import AudioProcessing as ap 
import ImageTools as it 


# input image dimensions 
img_rows, img_cols = 28, 28 
dim = 1 
# number of convolutional filters to use 
nb_filters = 32 
# size of pooling area for max pooling 
nb_pool = 2 
# convolution kernel size 
nb_conv = 3 

batch_size = 128 
nb_classes = 2 
nb_epoch = 200 

for i in range(20): 
    print "\n" 

## Generate spectrograms if necessary 
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0): 
    print "Audio files are already processed. Skipping..." 
else: 
    # Read the result csv 
    df = pd.read_csv('./AudioNormalPathalogicClassification/Result/AudioNormalPathalogicClassification_result.csv', header = None, encoding = "utf-8") 

    df.columns = ["RegionName","Filepath","IsNormal"] 

    bool_mapping = {True : 1, False : 0} 

    for col in df: 
     if(col == "RegionName" or col == "Filepath"): 
      a = 3  
     else: 
      df[col] = df[col].map(bool_mapping) 

    region_names = df.iloc[:,0].values 
    filepaths = df.iloc[:,1].values 
    y = df.iloc[:,2].values 
    #Generate spectrograms and make a new CSV file 
    print "Generating spectrograms for the audio files..." 
    result = ap.audio_2_image(filepaths, region_names, y, "./AudioNormalPathalogicClassification/Image/", ".png",(img_rows,img_cols)) 
    df = pd.DataFrame(data = result) 
    df.to_csv("NormalVsPathalogic.csv",header= False, index = False, encoding = "utf-8") 

# Load images into memory 
print "Loading images into memory..." 
df = pd.read_csv('NormalVsPathalogic.csv', header = None, encoding = "utf-8") 
y = df.iloc[:,0].values 
y = np_utils.to_categorical(y, nb_classes) 
y = np.asarray(y) 

X = df.iloc[:,1:].values 
X = np.asarray(X) 
X = X.reshape(X.shape[0], dim, img_rows, img_cols) 
X = X.astype("float32") 
X /= 255 

print X.shape 

model = Sequential() 

model.add(Convolution2D(64, nb_conv, nb_conv, 
         border_mode='valid', 
         input_shape=(1, img_rows, img_cols))) 

model.add(Activation('relu')) 

model.add(Convolution2D(32, nb_conv, nb_conv)) 
model.add(Activation('relu')) 
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool))) 

model.add(Dropout(0.25)) 

model.add(Flatten()) 

model.add(Dense(128)) 
model.add(Activation('relu')) 

model.add(Dropout(0.5)) 

model.add(Dense(nb_classes)) 
model.add(Activation('softmax')) 

model.compile(loss='categorical_crossentropy', optimizer='adadelta') 

print model.summary() 

model.fit(X, y, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1) 
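
As an aside (not part of the original answer), such label conflicts can be spotted in the NormalVsPathalogic.csv produced above with a couple of pandas calls: exact duplicate rows collapse to one, and anything still duplicated on the pixel columns alone must carry more than one label.

import pandas as pd

df = pd.read_csv("NormalVsPathalogic.csv", header = None)

# Column 0 holds the label, the remaining columns hold the flattened pixels.
deduped = df.drop_duplicates()  # identical pixels with the identical label collapse to one row
pixel_cols = list(deduped.columns[1:])
conflicts = deduped[deduped.duplicated(subset = pixel_cols, keep = False)]
print("%d rows share their pixels with a row that carries a different label" % len(conflicts))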

"It was very dirty: the same input had two different outputs, which caused confusion" -> What do you mean? This is **confusing**. – Ralf


I mean there were errors in the data labeling. Likewise, some inputs that should have been labeled as 1 were labeled as 0. –

Answer (score 3)

Check this out:

from keras import optimizers

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(loss = "categorical_crossentropy", 
       optimizer = sgd, 
       metrics=['accuracy'] 
      ) 

Check out the documentation.

I got better results on MNIST with this.
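For context (not part of the answer): in Keras 1.x, requesting metrics=['accuracy'] at compile time is what makes fit() and evaluate() report accuracy, replacing the older show_accuracy flag used in the question. A rough usage sketch against the model compiled above and the question's train/test split:

model.fit(X_train, y_train, batch_size = 128, nb_epoch = 10,
          validation_data = (X_test, y_test), verbose = 2)
loss, acc = model.evaluate(X_test, y_test, verbose = 0)
print("Test loss:", loss)
print("Test accuracy:", acc)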