2013-10-02 36 views
1

我正在使用光譜數據,我試圖找到哪個波長組合最能預測特定的濃度水平。加速lm()優化數據幀

我已經得到了以下數據:

spectrum <- data.frame(structure(list(reference = c("00383_130927131406", "00383_130927131636" 
), concentration = c(785, 39), `200` = c(0.1818293, 0.1818728 
), `202` = c(0.2090413, 0.2052553), `204` = c(0.2317517, 0.2450542 
), `206` = c(0.2427286, 0.2486499), `208` = c(0.3005925, 0.2602714 
), `210` = c(0.3774263, 0.3418267), `212` = c(0.4162179, 0.3934315 
), `214` = c(0.483283, 0.4533348), `216` = c(0.5029044, 0.5114449 
), `218` = c(0.4974553, 0.5390385), `220` = c(0.4940954, 0.5900267 
), `222` = c(0.4953246, 0.6695098), `224` = c(0.481304, 0.7726094 
), `226` = c(0.4513558, 0.8644904), `228` = c(0.4198686, 0.8791566 
), `230` = c(0.3907493, 0.7864748), `232` = c(0.3591166, 0.6480582 
), `234` = c(0.3277905, 0.49029), `236` = c(0.3033395, 0.3715453 
), `238` = c(0.2875585, 0.2875996), `240` = c(0.274983, 0.2247074 
), `242` = c(0.2685759, 0.1875459), `244` = c(0.2638485, 0.1637982 
), `246` = c(0.2596794, 0.1469171), `248` = c(0.2566508, 0.1360679 
), `250` = c(0.2534968, 0.1289802), `252` = c(0.2487593, 0.1223562 
), `254` = c(0.2440191, 0.1170995), `256` = c(0.2390878, 0.1120935 
), `258` = c(0.2350084, 0.107386), `260` = c(0.2326146, 0.105811 
), `262` = c(0.231193, 0.1047314), `264` = c(0.2268821, 0.1026403 
), `266` = c(0.2228926, 0.1005902), `268` = c(0.2188771, 0.0981951 
), `270` = c(0.2141229, 0.0956424), `272` = c(0.2097481, 0.0937292 
), `274` = c(0.2053046, 0.0918917), `276` = c(0.1986362, 0.0895234 
), `278` = c(0.1925066, 0.0873742), `280` = c(0.1857309, 0.0845037 
), `282` = c(0.1798247, 0.0821695), `284` = c(0.173684, 0.0794527 
), `286` = c(0.1681736, 0.0774221), `288` = c(0.1636854, 0.075914 
), `290` = c(0.1600333, 0.0746672), `292` = c(0.1567454, 0.0735234 
), `294` = c(0.1537335, 0.0723853), `296` = c(0.1514025, 0.0713109 
), `298` = c(0.1496398, 0.0703921), `300` = c(0.1482034, 0.0702467 
), `302` = c(0.1455175, 0.0688483), `304` = c(0.1422886, 0.067099 
), `306` = c(0.1396007, 0.065432), `308` = c(0.1368657, 0.0632867 
), `310` = c(0.1347514, 0.0616299), `312` = c(0.1332553, 0.0602099 
), `314` = c(0.1318352, 0.0587635), `316` = c(0.1304504, 0.0576302 
), `318` = c(0.1291949, 0.056583), `320` = c(0.1272951, 0.0549143 
), `322` = c(0.1265529, 0.0536381), `324` = c(0.1259775, 0.0525203 
), `326` = c(0.1251506, 0.0515252), `328` = c(0.1242249, 0.0509617 
), `330` = c(0.1237719, 0.0502567), `332` = c(0.1234315, 0.0497051 
), `334` = c(0.1228537, 0.0491512), `336` = c(0.1219852, 0.0484357 
), `338` = c(0.1209263, 0.0478882), `340` = c(0.1198741, 0.0474155 
), `342` = c(0.1191702, 0.0461311), `344` = c(0.1190057, 0.046185 
), `346` = c(0.1191455, 0.0463802), `348` = c(0.1181966, 0.0457693 
), `350` = c(0.1172899, 0.045634), `352` = c(0.116555, 0.0453343 
), `354` = c(0.1159115, 0.0445265), `356` = c(0.1154331, 0.0440425 
), `358` = c(0.1148589, 0.043449), `360` = c(0.1143816, 0.0427716 
), `362` = c(0.114209, 0.04271), `364` = c(0.1133982, 0.0423733 
), `366` = c(0.1120933, 0.0421726), `368` = c(0.1119338, 0.042341 
), `370` = c(0.1102654, 0.0409827), `372` = c(0.1094487, 0.0409078 
), `374` = c(0.1090142, 0.0407193), `376` = c(0.1085882, 0.0399118 
), `378` = c(0.1085156, 0.0396651), `380` = c(0.1082574, 0.0396597 
), `382` = c(0.1076243, 0.0393841), `384` = c(0.1070805, 0.0390632 
), `386` = c(0.1067173, 0.0387599), `388` = c(0.1062678, 0.0382344 
), `390` = c(0.1057367, 0.037716), `392` = c(0.1056847, 0.0376194 
), `394` = c(0.1055271, 0.0375567), `396` = c(0.1053451, 0.0375475 
), `398` = c(0.1051099, 0.0374534), `400` = c(0.1054062, 0.037516 
), `402` = c(0.1052253, 0.0373257), `404` = c(0.1043745, 0.0368734 
), `406` = c(0.1034489, 0.0366216), `408` = c(0.1026329, 0.0364288 
), `410` = c(0.1018779, 0.035892), `412` = c(0.1016637, 0.0360423 
), `414` = c(0.1015407, 0.0360411), `416` = c(0.1007153, 0.0356497 
), `418` = c(0.1007498, 0.0355663), `420` = c(0.1004693, 0.0345159 
), `422` = c(0.0998567, 0.0338999), `424` = c(0.0998918, 0.0338306 
), `426` = c(0.0999174, 0.0338034), `428` = c(0.0998015, 0.0338135 
), `430` = c(0.0995906, 0.0338477), `432` = c(0.0989122, 0.0335684 
), `434` = c(0.097852, 0.0329977), `436` = c(0.09709, 0.0327005 
), `438` = c(0.0962624, 0.0324584), `440` = c(0.0957877, 0.0320588 
), `442` = c(0.0952125, 0.0318741), `444` = c(0.0950441, 0.0320549 
), `446` = c(0.0946589, 0.0318478), `448` = c(0.0944174, 0.0317015 
), `450` = c(0.0946495, 0.19), `452` = c(0.0945732, 0.0321195 
), `454` = c(0.0939588, 0.0314291), `456` = c(0.0932608, 0.0308688 
), `458` = c(0.0926701, 0.0306815), `460` = c(0.0923376, 0.0303084 
), `462` = c(0.0926626, 0.0307978), `464` = c(0.0939299, 0.0317353 
), `466` = c(0.0930716, 0.0309835), `468` = c(0.0929553, 0.0311153 
), `470` = c(0.0930083, 0.0313657), `472` = c(0.0926628, 0.0310591 
), `474` = c(0.0922276, 0.0306481), `476` = c(0.092029, 0.0305241 
), `478` = c(0.0915025, 0.0304446), `480` = c(0.0904626, 0.0299507 
), `482` = c(0.0893053, 0.0291639), `484` = c(0.0885098, 0.0286272 
), `486` = c(0.0888613, 0.0284567), `488` = c(0.0903306, 0.0288371 
), `490` = c(0.0918492, 0.0302382), `492` = c(0.092112, 0.0302303 
), `494` = c(0.0921006, 0.0303867), `496` = c(0.0918876, 0.0306633 
), `498` = c(0.0918417, 0.0303878), `500` = c(0.0918264, 0.030263 
), `502` = c(0.0915124, 0.0302606), `504` = c(0.0907789, 0.0301055 
), `506` = c(0.0902592, 0.0299799), `508` = c(0.0902585, 0.0297897 
), `510` = c(0.0903141, 0.0295952), `512` = c(0.0903744, 0.0300794 
), `514` = c(0.0906949, 0.0303025), `516` = c(0.0907089, 0.0298949 
), `518` = c(0.0905071, 0.0300078), `520` = c(0.0898525, 0.0290348 
), `522` = c(0.0893119, 0.0290643), `524` = c(0.088986, 0.0289325 
), `526` = c(0.0895112, 0.0285626), `528` = c(0.0897726, 0.0285274 
), `530` = c(0.0895805, 0.0287086), `532` = c(0.0891773, 0.0287757 
), `534` = c(0.0889579, 0.0287846), `536` = c(0.0884922, 0.028611 
), `538` = c(0.0877636, 0.028343), `540` = c(0.0880152, 0.0284555 
), `542` = c(0.0882578, 0.0282067), `544` = c(0.0883219, 0.028523 
), `546` = c(0.0882104, 0.0291039), `548` = c(0.0882543, 0.0290097 
), `550` = c(0.0880007, 0.0289382), `552` = c(0.0878873, 0.0289249 
), `554` = c(0.0874991, 0.0282089), `556` = c(0.0876973, 0.0285702 
), `558` = c(0.0875319, 0.0283379), `560` = c(0.087135, 0.0280649 
), `562` = c(0.0875509, 0.0285492), `564` = c(0.0876262, 0.0285577 
), `566` = c(0.0869524, 0.0280579), `568` = c(0.0869787, 0.0282936 
), `570` = c(0.0870611, 0.0281805), `572` = c(0.0868479, 0.0274971 
), `574` = c(0.0867729, 0.0280225), `576` = c(0.0872913, 0.0284508 
), `578` = c(0.0871356, 0.0287336), `580` = c(0.0864677, 0.0284608 
), `582` = c(0.0869201, 0.0286578), `584` = c(0.0867237, 0.0283562 
), `586` = c(0.0866258, 0.0280203), `588` = c(0.0866252, 0.0279319 
), `590` = c(0.0863437, 0.0276132), `592` = c(0.0861822, 0.0272436 
), `594` = c(0.0870913, 0.0279816), `596` = c(0.0876164, 0.0284226 
), `598` = c(0.0869303, 0.0277859), `600` = c(0.0864942, 0.027882 
), `602` = c(0.0862525, 0.0279344), `604` = c(0.0862981, 0.0269194 
), `606` = c(0.0865318, 0.0274518), `608` = c(0.086784, 0.0281402 
), `610` = c(0.0868649, 0.0275235), `612` = c(0.0870241, 0.0285917 
), `614` = c(0.0868249, 0.028492), `616` = c(0.0868225, 0.0277449 
), `618` = c(0.086214, 0.02755), `620` = c(0.0864288, 0.0277508 
), `622` = c(0.0863217, 0.0272391), `624` = c(0.0866155, 0.0272815 
), `626` = c(0.0872267, 0.0278741), `628` = c(0.0873636, 0.0279329 
), `630` = c(0.0868396, 0.0276163), `632` = c(0.0864614, 0.0279268 
), `634` = c(0.0862915, 0.0279218), `636` = c(0.0857102, 0.0269628 
), `638` = c(0.0858776, 0.0274795), `640` = c(0.0871622, 0.028363 
), `642` = c(0.0873245, 0.0278735), `644` = c(0.0873235, 0.0288401 
), `646` = c(0.0880486, 0.02906), `648` = c(0.0875043, 0.0273798 
), `650` = c(0.0869163, 0.026752), `652` = c(0.0871902, 0.0270702 
), `654` = c(0.0866511, 0.0266837), `656` = c(0.0868393, 0.0269093 
), `658` = c(0.0883237, 0.0275446), `660` = c(0.0880274, 0.0282655 
), `662` = c(0.0881567, 0.0283123), `664` = c(0.0878181, 0.0282416 
), `666` = c(0.0870453, 0.0282417), `668` = c(0.0869403, 0.0280012 
), `670` = c(0.0875342, 0.0284779), `672` = c(0.0881081, 0.0288431 
), `674` = c(0.0883047, 0.0280668), `676` = c(0.089067, 0.0283966 
), `678` = c(0.0899504, 0.0291833), `680` = c(0.0882103, 0.0272003 
), `682` = c(0.0871596, 0.0260054), `684` = c(0.0879612, 0.0267891 
), `686` = c(0.0873405, 0.0269836), `688` = c(0.0882181, 0.0278995 
), `690` = c(0.0884919, 0.028267), `692` = c(0.0887171, 0.0296971 
), `694` = c(0.0877552, 0.0283004), `696` = c(0.0879554, 0.0275502 
), `698` = c(0.0890788, 0.0286819), `700` = c(0.0886739, 0.0274891 
), `702` = c(0.0881579, 0.0268598), `704` = c(0.088684, 0.028465 
), `706` = c(0.0889838, 0.0276326), `708` = c(0.0897651, 0.0278372 
)), .Names = c("reference", "concentration", "200", "202", "204", 
"206", "208", "210", "212", "214", "216", "218", "220", "222", 
"224", "226", "228", "230", "232", "234", "236", "238", "240", 
"242", "244", "246", "248", "250", "252", "254", "256", "258", 
"260", "262", "264", "266", "268", "270", "272", "274", "276", 
"278", "280", "282", "284", "286", "288", "290", "292", "294", 
"296", "298", "300", "302", "304", "306", "308", "310", "312", 
"314", "316", "318", "320", "322", "324", "326", "328", "330", 
"332", "334", "336", "338", "340", "342", "344", "346", "348", 
"350", "352", "354", "356", "358", "360", "362", "364", "366", 
"368", "370", "372", "374", "376", "378", "380", "382", "384", 
"386", "388", "390", "392", "394", "396", "398", "400", "402", 
"404", "406", "408", "410", "412", "414", "416", "418", "420", 
"422", "424", "426", "428", "430", "432", "434", "436", "438", 
"440", "442", "444", "446", "448", "450", "452", "454", "456", 
"458", "460", "462", "464", "466", "468", "470", "472", "474", 
"476", "478", "480", "482", "484", "486", "488", "490", "492", 
"494", "496", "498", "500", "502", "504", "506", "508", "510", 
"512", "514", "516", "518", "520", "522", "524", "526", "528", 
"530", "532", "534", "536", "538", "540", "542", "544", "546", 
"548", "550", "552", "554", "556", "558", "560", "562", "564", 
"566", "568", "570", "572", "574", "576", "578", "580", "582", 
"584", "586", "588", "590", "592", "594", "596", "598", "600", 
"602", "604", "606", "608", "610", "612", "614", "616", "618", 
"620", "622", "624", "626", "628", "630", "632", "634", "636", 
"638", "640", "642", "644", "646", "648", "650", "652", "654", 
"656", "658", "660", "662", "664", "666", "668", "670", "672", 
"674", "676", "678", "680", "682", "684", "686", "688", "690", 
"692", "694", "696", "698", "700", "702", "704", "706", "708" 
))) 

列「濃度」是濃度和列200:708與吸收對應波長。通常我的示例數據幀更長(n> 50)。

我想要做的是找出是否可以用光譜分析(UV-vis)來監測廢水中磷和氮的濃度。因此我需要找到測量濃度與觀測波長之間的關係。我僅使用波長200-708nm,因此與質譜儀無法比擬。

我已經得到了以下工作代碼:

grid2 <- expand.grid(w1=seq(200,708, 2), w2=seq(200,708, 2)) 

colnames_spec <- colnames(spectrum) 

    wave2 <- lapply(1:nrow(grid2), function(j){ 
    w1 <- grid2$w1[j] 
    w2 <- grid2$w2[j] 
    cond2 <- colnames_spec==paste0("X",w2) 
    cond1 <- colnames_spec==paste0("X",w1) 
    fit <- lm(spectrum$concentration~spectrum[,cond1]+spectrum[,cond2]) 
    assign(paste("r",j,sep=""), value=c(w1,w2,summary(fit)$r.squared)) 
    } 
) 
    tot_rsq_2 <- do.call(rbind, wave2) 
    tot_rsq_2 <- as.data.frame(tot_rsq_2) 
    colnames(tot_rsq_2) <- c("wavelength.1", "wavelength.2", "r_squared") 
    max_rsq_2 <- head(tot_rsq_2[with(tot_rsq_2, order(r_squared, decreasing=T)), ],n=5) 
    print(max_rsq_2) 

腳本大約需要三分鐘,我跑,但有什麼辦法,以加快代碼?

此外,我想擴大線性模型來搜索三個波長的最佳。那麼grid2將有255 * 255 * 255行,這將需要很長時間才能運行。

任何幫助,非常感謝。

+1

您可以用低級函數lm.fit()替換lm()。 – rcs

+0

你有沒有考慮過使用「隨機森林」?好像它是爲這類問題而設計的。 –

+0

我並不十分熟悉隨機森林的概念。我認爲主要組件(PCA)是另一種選擇,儘管我不知道它是否會真正提高性能。 – Timror

回答

3

由於迴歸中預測變量的順序對R²無關緊要,因此可以使用combn而不是expand.grid,這意味着需要計算較少的迴歸。此外,你應該使用更快的函數進行迴歸。

spec <- as.matrix(spectrum[,-1]) 

library(RcppEigen) 
fun <- function(ind) { 
    fit <- fastLm(spec[,1]~spec[,ind[1]+1]+spec[,ind[2]+1]) 
    1-sum(fit$residuals^2)/sum((fit$fitted.values-mean.default(fit$fitted.values))^2) 
} 

res <- combn(ncol(spec)-1, 2, fun) 
res <- cbind.data.frame(t(matrix(names(spectrum)[combn(ncol(spec)-1, 2)+2],2)), 
      res) 
head(res) 
#  1 2 res 
# 1 X200 X202 1 
# 2 X200 X204 1 
# 3 X200 X206 1 
# 4 X200 X208 1 
# 5 X200 X210 1 
# 6 X200 X212 1 
+0

謝謝。你的代碼爲我加快了3次。用三個參數進行計算仍然需要很長時間。有任何想法嗎? – Timror

+0

是的,研究替代的可能性(@SamMason提到了一條可能的路徑)或者使用Rcpp在組合上實現循環。並行化也許值得考慮。 – Roland