2015-08-26 152 views
0

我搜索過並找到了許多接近的解決方案,但最終沒有一個能用。對有經驗的人來說,這可能是非常簡單的事情……問題:在 R 中轉置併合並多個含有缺失數據和空白列名的數據幀 / 在 dcast 之前重命名 melt 後的列

這是我的數據片段。它是由 jsonlite 包從 JSON 導入時自動創建的。數據結構非常好,但我仍然無能爲力。更新 #2:我已在下方添加了 dput 的輸出。

structure(list(rightsize = c(42L, 50L, 52L, 49L, 41L, 41L, 41L, 
41L, 41L, 45L, 47L, 42L, 45L, 46L, 42L, 44L, 44L, 37L, 44L, 41L 
), hitlen = c("", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", ""), linegroup = c("_", "_", "_", 
"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", 
"_", "_", "_", "_"), leftsize = c(46L, 43L, 43L, 37L, 49L, 43L, 
43L, 45L, 45L, 43L, 44L, 46L, 45L, 46L, 44L, 43L, 54L, 45L, 51L, 
47L), leftspace = c("  ", "   ", "   ", 
"     ", "  ", "   ", "   ", "   ", 
"   ", "   ", "   ", "  ", "   ", 
"  ", "   ", "   ", "", "   ", " ", 
"  "), Left = list(structure(list(class = c("", "coll", 
""), str = c("patients with ", "chronic", " obstructive pulmonary" 
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("respect to ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str" 
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("While there is no cure for this ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "strc", "", "coll", ""), str = c(".", 
"</p><p>", "When patients with ", "chronic", " liver")), .Names = c("class", 
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
    class = c("", "coll", ""), str = c("bronchitis , and ", "chronic", 
    " obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("offers the possibility that ", 
"chronic", " lung")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" , such as ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str" 
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("always as clear in other ", 
    "chronic", " incurable")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("may have the potential to prevent ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" half the estimated cost of all ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("is consistent with the tact that ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("used to treat ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str" 
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("ingredient for dietary therapy of ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("patients with ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str" 
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("greater for ", "chronic", 
    " obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" departments , with schemes for ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("postponement of death by means of managing ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("certainly be ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str" 
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("cardiovascular disease , cancer , other ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("terminal illnesses are converted to ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L))), Right = list(structure(list(class = "", str = " who may be at risk of developing steroid"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " - plausibly related to exposure to environmental"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " , it can be treated , Black says . Antidepressants"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " ask what they can do to improve their condition"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " [ COPD ]) was 15 % (estimated within "), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " is part of the continuum of development"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " (70 , 71) and sleep apnea . Elevation"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . Patients with heart failure highlight"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " other than heart disease , and helps us"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " in this country . Furthermore , the portion"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " are multigenic and multifactorial . Therefore"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . Nasal corticosteroids are increasingly"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " such as diabetes mellitus or hyperlipidemia"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " (COPD) concluded exercise relieves dyspnea"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " than for any other disease. 5 The number"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " management in patients with COPD receiving"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " and disability is costly , and it is bound"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = c("", "strc", ""), str = c(" .", "</p><p>", "Much rarer condition , but people" 
    )), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = "", str = " , and in fact those rates have been rising"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . The panel 's report is negative about"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L)), toknum = c(580661252L, 
585871494L, 572902309L, 596182644L, 611091300L, 604962106L, 605346237L, 
585102838L, 575701411L, 616556239L, 548908661L, 604489309L, 548601059L, 
617460845L, 585870185L, 591049175L, 581965276L, 592616458L, 592591831L, 
599295354L), rightspace = c("   ", " ", "", " ", "   ", 
"   ", "   ", "   ", "   ", "  ", 
"  ", "   ", "  ", "  ", "   ", "  ", 
"  ", "    ", "  ", "   "), Tbl_refs = list(
    "11.99.0023.006", "11.99.0031.001", "11.99.0012.004", "11.99.0046.013", 
    "11.99.0069.003", "11.99.0059.007", "11.99.0060.003", "11.99.0030.001", 
    "11.99.0016.007", "11.99.0077.021", "11.01.0003.015", "11.99.0059.003", 
    "11.01.0003.006", "11.99.0078.034", "11.99.0031.001", "11.99.0038.005", 
    "11.99.0025.005", "11.99.0040.006", "11.99.0040.006", "11.99.0051.011"), 
    ref = c("11.99.0023.006", "11.99.0031.001", "11.99.0012.004", 
    "11.99.0046.013", "11.99.0069.003", "11.99.0059.007", "11.99.0060.003", 
    "11.99.0030.001", "11.99.0016.007", "11.99.0077.021", "11.01.0003.015", 
    "11.99.0059.003", "11.01.0003.006", "11.99.0078.034", "11.99.0031.001", 
    "11.99.0038.005", "11.99.0025.005", "11.99.0040.006", "11.99.0040.006", 
    "11.99.0051.011")), .Names = c("rightsize", "hitlen", "linegroup", 
"leftsize", "leftspace", "Left", "Right", "Kwic", "toknum", "rightspace", 
"Tbl_refs", "ref"), class = "data.frame", row.names = c(NA, 20L 
)) 

我需要做的是:1) 轉置這 4 個數據幀,使「class」中指定的值成爲對應數據的列標題。注意 #1:列數可能不同。另請注意 (#2):某些列名會是 ""。因此,這裏的絕妙解決方案(the wonderful solution here)產生的數據幀中,有些列標題全被垃圾填滿,使得下一步(合併數據幀)無法進行,例如:

  1. ""
  2. strc
  3. structure("When patients with ", class = "AsIs")
  4. coll
  5. structure(" liver", class = "AsIs")

(這些被垃圾填滿的列標題似乎就是第一個之後、名稱爲 "" 的那些列。)

繼那一步,我隨後將需要合併這些dataframes,而佔遺漏值。 Rbind.fill會做伎倆,但只有當數據足夠統一時。我已搜索high & low尋求解決方案,並且尚未找到足以解決此問題的解決方案。

更新:我繼續嘗試熔化/鑄造。下面帶來的是非常接近可接受的,最終的解決方案:

# First attempt: stack the per-hit `Left` data frames into long form, then
# cast them wide with one column per distinct `class` value.
# library() stops with an error if reshape2 is not installed; require() would
# only return FALSE and let the script fail later with a less clear message.
library(reshape2)
# melt() on a list of data frames adds an `L1` column identifying the element.
docx <- melt(documentdata$Left, id.vars = "class")
# Every row whose `class` is "" ends up in the same output column, so those
# values are merged together by the list aggregation.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)

唯一的問題是,如前所述,空白的「class」值導致結構在 dcast 中丟失:所有未命名的列被合併到一起並且順序錯亂,例如

L1 variable Var.3 coll strc 
1 1 str patients with , obstructive pulmonary chronic 
2 2 str respect to , obstructive pulmonary chronic 
3 3 str While there is no cure for this , chronic 
4 4 str ., When patients with , liver chronic </p><p> 
5 5 str bronchitis , and , obstructive pulmonary chronic 

原始數據中關鍵的「class」值是「coll」,它的前面和後面總是各有至少一個空白的 class。一種解決方案可能是在 dcast 之前創建名稱「pre-coll」和「post-coll」?

更新#3:這是一個可能的,儘管醜陋的解決方案。任何「清潔」選項?

# Workaround: label the unnamed rows directly before/after each "coll" row so
# that dcast() gets distinct column names instead of one merged "" column.
# library() stops with an error if reshape2 is not installed; require() would
# only return FALSE and let the script fail later with a less clear message.
library(reshape2)
docx <- melt(documentdata$Left, id.vars = "class")
# Locate the "coll" rows once and reuse the positions for both neighbours.
coll_rows <- which(docx$class == "coll")
# NOTE(review): assumes every "coll" row has a row before and after it that
# belongs to the same hit, as described for this data set -- confirm before
# reusing on other input.
docx$class[coll_rows - 1] <- "l.pre"
docx$class[coll_rows + 1] <- "l.post"
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)
# Keep only the three context columns, in reading order.
docx.left <- docx[, c("l.pre", "coll", "l.post")]

在此先感謝您的幫助。

+0

如果您共享一些數據以便複製和粘貼,則您更有可能獲得答案。爲此,請嘗試'dput(documentdata)'並將其粘貼到問題 – jeremycg

+0

謝謝。我已經完成了。提前致謝! – Mach5RacerGoGo

回答

3

讓我們從dplyr做到這一點:

library(dplyr)
# Stack the per-hit `Left` data frames into one data frame, then pick the
# string directly before, at, and directly after every "coll" row.
documentdata$Left %>%
  do.call(rbind, .) %>%
  do({
    # Positions of the "coll" rows in the stacked data frame.
    coll_rows <- which(.[["class"]] == "coll")
    strings <- .[["str"]]
    data.frame(
      pre = strings[coll_rows - 1],
      coll = strings[coll_rows],
      post = strings[coll_rows + 1]
    )
  })

              pre coll     post 
1        patients with chronic obstructive pulmonary 
2         respect to chronic obstructive pulmonary 
3    While there is no cure for this chronic      
4       When patients with chronic     liver 
5       bronchitis , and chronic obstructive pulmonary 
6     offers the possibility that chronic     lung 
.... 
18        certainly be chronic obstructive pulmonary 
19 cardiovascular disease , cancer , other chronic      
20  terminal illnesses are converted to chronic 

編輯:解釋:dplyr 的語法有些奇怪。請參閱 dplyr vignette 和 data wrangling cheat sheet。%>% 是來自 magrittr 包的管道,它只是把管道左邊所有內容的輸出作爲右邊函數的第一個參數:

# The pipe inserts its left-hand side as the first argument of the call.
5 %>% c(1) 
# same as
c(5, 1) 

如果想在其他位置使用左邊的內容,可以用 . 來代表它。如果需要,還可以對 . 取子集(例如 .[["str"]]):

# An explicit `.` puts the left-hand side at that position instead.
5 %>% c(1, .) 
# same as
c(1, 5) 

do允許我們做任何我們想要的計算,而不必擔心標準dplyr動詞 - 這是一個包裝。見?do

所以答案就是取 documentdata$Left,把它傳入 do.call(rbind, .),將列表摺疊成一個數據幀(到目前爲止,這與 do.call(rbind, documentdata$Left) 相同)。然後我們把結果傳給 do,它會生成一個新的數據幀,並從 . 中選出相關的列。

+0

那,先生,很漂亮。謝謝!如果可能的話,你能解釋一下嗎?我特別好奇「%>%」、「do」和「.」的用法。

+0

不用擔心,看看編輯。 – jeremycg

+0

非常好。 – Mach5RacerGoGo