3
我必須計算包含在5GB csv文件中的向量的相關矩陣。每行包含一個觀察每個隨機變量。要做到這一點,我寫了下面的:F#中的惰性關聯矩陣計算
let getCorrMatrix data =
let getMatrixInfo nCol (count,crossProd:float array array,sumVector:float array,sqVector:float array) (newLine:float array) =
for i in 0..(nCol-1) do
sumVector.[i]<-sumVector.[i]+newLine.[i]
sqVector.[i]<-sqVector.[i]+(newLine.[i]*newLine.[i])
for j in (i+1)..(nCol-1) do
crossProd.[i].[j-(i+1)]<-crossProd.[i].[j-(i+1)]+newLine.[i]*newLine.[j]
let newCount = count+1
//(newCount,newMatrix,newSumVector,newSqVector)
(newCount,crossProd,sumVector,sqVector)
//Get number of columns
let nCol = data|>Seq.head|>Seq.length
//Initialize objects for the fold
let matrixStart = Array.init nCol (fun i -> Array.create (nCol-i-1) 0.0)
let sumVector = Array.init nCol (fun _ -> 0.0)
let sqVector = Array.init nCol (fun _ -> 0.0)
let init = (0,matrixStart,sumVector,sqVector)
//Run the fold and obtain all the elements to build te correlation matrix
let (count,crossProd,sum,sq) =
data
|>PSeq.fold(getMatrixInfo nCol) init
//Compute averages standard deviations, and finally correlations
let averages = sum|>Array.map(fun s ->s/(float count))
let std = Array.zip3 sum sq averages
|> Array.map(fun (elemSum,elemSq,av)-> let temp = elemSq-2.0*av*elemSum+float(count)*av*av
sqrt (temp/(float count-1.0)))
//Map allteh elements to correlation
let rec getCorr i j =
if i=j then
1.0
elif i<j then
(crossProd.[i].[j-(i+1)]-averages.[i]*sum.[j]-averages.[j]*sum.[i]+(float count*averages.[i]*averages.[j]))/((float count-1.0)*std.[i]*std.[j])
else
getCorr j i
let corrMatrix = Array2D.init nCol nCol (fun i j -> getCorr i j)
corrMatrix
我已經測試它反對研究計算和它匹配。由於我打算反覆使用這個,如果你有一些反饋意見(或發現一個錯誤),將不勝感激。 (注意我發佈這個是因爲認爲它可能對其他人有用)。
感謝
感謝代碼運行得更快! (剛剛發現你的網站,非常酷!) – jlezard 2010-10-02 12:53:00
和'fun i j - > getCorr i j'只能是'getCorr'。 :-) – 2010-10-02 18:54:34