2017-03-22 14 views
1

我使用rjson導入了一個json文件,並將其轉換爲data.frame,但所有數據都是橫向擴展的,列名包含密鑰信息。沒有按ID熔化的列

stations <- fromJSON(file = "station_information.json") 
test <- as.data.frame(stations[3]) 

什麼這貌似是:

> dim(test) 
[1] 2 5985 

> test[1:27] 
    data.stations.station_id data.stations.name data.stations.short_name 
1      72 W 52 St & 11 Ave     6926.01 
2      72 W 52 St & 11 Ave     6926.01 
    data.stations.lat data.stations.lon data.stations.region_id 
1   40.76727   -73.99393      71 
2   40.76727   -73.99393      71 
    data.stations.rental_methods data.stations.capacity 
1       KEY      39 
2     CREDITCARD      39 
    data.stations.eightd_has_key_dispenser data.stations.station_id.1 
1         FALSE       79 
2         FALSE       79 
     data.stations.name.1 data.stations.short_name.1 data.stations.lat.1 
1 Franklin St & W Broadway     5430.08   40.71912 
2 Franklin St & W Broadway     5430.08   40.71912 
    data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1 
1   -74.00667      71       KEY 
2   -74.00667      71      CREDITCARD 
    data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1 
1      33         FALSE 
2      33         FALSE 
    data.stations.station_id.2 data.stations.name.2 data.stations.short_name.2 
1       82 St James Pl & Pearl St     5167.06 
2       82 St James Pl & Pearl St     5167.06 
    data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2 
1   40.71117   -74.00017      71 
2   40.71117   -74.00017      71 
    data.stations.rental_methods.2 data.stations.capacity.2 
1       KEY      27 
2      CREDITCARD      27 
    data.stations.eightd_has_key_dispenser.2 
1         FALSE 
2         FALSE 

正如你可以看到,這不能固定用一個簡單的轉置t()melt()解決方案。我想知道我在導入或轉換爲data.frame時做了什麼錯誤,這給我留下了一個數據框,該數據框的索引應該是列名附加的行。

我已經試過這兩種方法,但我留下了相同的拉伸數據:

plyr::ldply(stations, data.frame) 

do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE)) 

最後,我想我的輸出看每9列被「腰斬」之類,並堆放到第一個9 - 所以我留下655行和9列任何建議,將不勝感激。

注:我直接從該link服用JSON(它不是一個大文件)

這是第一個27列,這應該被重新成形爲一個9×3數據幀的再現的例子:

> dput(df) 
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"), 
    data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"), 
    data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"), 
    data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888, 
    -73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE, 
    FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"), 
    data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"), 
    data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"), 
    data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661, 
    -74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE, 
    FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"), 
    data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"), 
    data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"), 
    data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545, 
    -74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE, 
    FALSE)), .Names = c("data.stations.station_id", "data.stations.name", 
"data.stations.short_name", "data.stations.lat", "data.stations.lon", 
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity", 
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1", 
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1", 
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1", 
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1", 
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2", 
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2", 
"data.stations.rental_methods.2", "data.stations.capacity.2", 
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA, 
-2L), class = "data.frame") 

所以輸出結構應該看起來像這樣(顯然值不是NA)。每一行代表原始數據幀的列名

> output 
    data.stations.station_id data.stations.name data.stations.short_name 
1      NA     NA      NA 
2      NA     NA      NA 
3      NA     NA      NA 
    data.stations.lat data.stations.lon data.stations.region_id 
1    NA    NA      NA 
2    NA    NA      NA 
3    NA    NA      NA 
    data.stations.rental_methods data.stations.capacity 
1       NA      NA 
2       NA      NA 
3       NA      NA 
    data.stations.eightd_has_key_dispenser 
1          NA 
2          NA 
3          NA 
+0

你會如何堆疊每9列的值? –

+1

就像'as.data。幀(矩陣(unlist(df),ncol = 9,byrow = TRUE))'也許 – Sotos

+0

這幾乎可行,但它將所有值連接在一起成對 –

回答

1

的附加索引號我會嘗試:

library(data.table) 
rbindlist(lapply(split(seq_along(df), c(0, (seq_along(df)%/%9)[-length(df)])), 
    function(x) df[, x]), use.names = FALSE) 
## data.stations.station_id  data.stations.name data.stations.short_name data.stations.lat 
## 1:      72   W 52 St & 11 Ave     6926.01   40.76727 
## 2:      72   W 52 St & 11 Ave     6926.01   40.76727 
## 3:      79 Franklin St & W Broadway     5430.08   40.71912 
## 4:      79 Franklin St & W Broadway     5430.08   40.71912 
## 5:      82 St James Pl & Pearl St     5167.06   40.71117 
## 6:      82 St James Pl & Pearl St     5167.06   40.71117 
## data.stations.lon data.stations.region_id data.stations.rental_methods 
## 1:   -73.99393      71       KEY 
## 2:   -73.99393      71     CREDITCARD 
## 3:   -74.00667      71       KEY 
## 4:   -74.00667      71     CREDITCARD 
## 5:   -74.00017      71       KEY 
## 6:   -74.00017      71     CREDITCARD 
## data.stations.capacity data.stations.eightd_has_key_dispenser 
## 1:      39         FALSE 
## 2:      39         FALSE 
## 3:      33         FALSE 
## 4:      33         FALSE 
## 5:      27         FALSE 
## 6:      27         FALSE 

也就是說,創建data.framelist A S與每個9列,rbind他們。這樣,轉換爲matrix時不會出現數據強制轉換的問題。

這導致6行×9列data.table。不知道你想用放棄行只有3排落得什麼規則....


但我認爲你正在試圖解決的是不存在的問題。嘗試閱讀你的數據是這樣的:

library(jsonlite) 
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json") 
head(x[[3]]$stations) 
## station_id       name short_name  lat  lon region_id 
## 1   72    W 52 St & 11 Ave 6926.01 40.76727 -73.99393  71 
## 2   79  Franklin St & W Broadway 5430.08 40.71912 -74.00667  71 
## 3   82  St James Pl & Pearl St 5167.06 40.71117 -74.00017  71 
## 4   83 Atlantic Ave & Fort Greene Pl 4354.07 40.68383 -73.97632  71 
## 5  116    W 17 St & 8 Ave 6148.02 40.74178 -74.00150  71 
## 6  119  Park Ave & St Edwards St 4700.06 40.69609 -73.97803  71 
## rental_methods capacity eightd_has_key_dispenser 
## 1 KEY, CREDITCARD  39     FALSE 
## 2 KEY, CREDITCARD  33     FALSE 
## 3 KEY, CREDITCARD  27     FALSE 
## 4 KEY, CREDITCARD  62     FALSE 
## 5 KEY, CREDITCARD  39     FALSE 
## 6 KEY, CREDITCARD  19     FALSE 


dim(x[[3]]$stations) 
# [1] 665 9 
1

可以使用矩陣但要確保你的所有因素列字符,即

ind <- sapply(df, is.factor) 
df[ind] <- lapply(df[ind], as.character) 

final_df <- as.data.frame(matrix(unlist(df), ncol = 9, byrow = TRUE)) 

final_df[c(TRUE, FALSE),] 
# V1 V2      V3      V4  V5  V6   V7   V8   V9 
#1 72 72   W 52 St & 11 Ave   W 52 St & 11 Ave 6926.01 6926.01 40.76727216 40.76727216 -73.99392888 
#3 79 79 Franklin St & W Broadway Franklin St & W Broadway 5430.08 5430.08 40.71911552 40.71911552 -74.00666661 
#5 82 82 St James Pl & Pearl St St James Pl & Pearl St 5167.06 5167.06 40.71117416 40.71117416 -74.00016545 

在另一方面,作爲@ A5C1D2H2I1M1N2O1R2T1筆記,你可能會尋找代替:

as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE)) 
# V1      V2  V3  V4  V5 V6   V7 V8 V9 
#1 72   W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71  KEY 39 FALSE 
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71  KEY 33 FALSE 
#3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71  KEY 27 FALSE 
#4 72   W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE 
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE 
#6 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE 
+0

這看起來不是正確的答案.... – A5C1D2H2I1M1N2O1R2T1

+0

不太清楚。 OP需要澄清 – Sotos

+1

檢查'unlist(df,use.names = FALSE)'和'c(t(df))'之間的區別。我想你的意思是使用後者.... – A5C1D2H2I1M1N2O1R2T1