2017-09-05 74 views
0

我有一個由嵌套列表轉換而來的莫斯科街道地址地理編碼結果列表。但是,我用來進行地理編碼的數據框中的地址都不含郵政編碼,因此在幾百個案例中(總共約 33k 條),同一街道地址返回了多個郵政編碼不同的結果。這會在列表中產生額外的嵌套,轉換爲數據框後,觀察數量就與初始數據框不一致。(問題標題:展平嵌套層次不一的列表會產生多餘的觀察)

只返回一個地址的結果具有以下結構(請忽略亂碼,R 控制檯無法正確顯示西里爾字母):

structure(list(results = structure(list(address_components = list(
    structure(list(long_name = c("4", "óëèöà Áîëüøàÿ Àêàäåìè÷åñêàÿ", 
    "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", 
    "127299"), short_name = c("4", "óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ", 
    "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", "Ìîñêâà", "Ìîñêâà", "RU", 
    "127299"), types = list("street_number", "route", c("political", 
    "sublocality", "sublocality_level_1"), c("locality", "political" 
    ), c("administrative_area_level_2", "political"), c("country", 
    "political"), "postal_code")), .Names = c("long_name", "short_name", 
    "types"), class = "data.frame", row.names = c(NA, 7L))), 
    formatted_address = "óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ, 4, Ìîñêâà, Ðîññèÿ, 127299", 
    geometry = structure(list(location = structure(list(lat = 55.8176896, 
     lng = 37.522891), .Names = c("lat", "lng"), class = "data.frame", row.names = 1L), 
     location_type = "ROOFTOP", viewport = structure(list(
      northeast = structure(list(lat = 55.8190385802915, 
       lng = 37.5242399802915), .Names = c("lat", "lng" 
      ), class = "data.frame", row.names = 1L), southwest = structure(list(
       lat = 55.8163406197085, lng = 37.5215420197085), .Names = c("lat", 
      "lng"), class = "data.frame", row.names = 1L)), .Names = c("northeast", 
     "southwest"), class = "data.frame", row.names = 1L)), .Names = c("location", 
    "location_type", "viewport"), class = "data.frame", row.names = 1L), 
    partial_match = TRUE, place_id = "ChIJ59yLsy1ItUYR5EEBFbFJoSA", 
    types = list("street_address")), .Names = c("address_components", 
"formatted_address", "geometry", "partial_match", "place_id", 
"types"), class = "data.frame", row.names = 1L), status = "OK"), .Names = c("results", 
"status")) 

而具有多個可能的地址的結果如下所示:

structure(list(results = structure(list(address_components = list(
    structure(list(long_name = c("23", "óëèöà Áîëüøàÿ Àêàäåìè÷åñêàÿ", 
    "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", 
    "127299"), short_name = c("23", "óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ", 
    "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", "Ìîñêâà", "Ìîñêâà", "RU", 
    "127299"), types = list("street_number", "route", c("political", 
    "sublocality", "sublocality_level_1"), c("locality", "political" 
    ), c("administrative_area_level_2", "political"), c("country", 
    "political"), "postal_code")), .Names = c("long_name", "short_name", 
    "types"), class = "data.frame", row.names = c(NA, 7L)), structure(list(
     long_name = c("23", "óëèöà Áîëüøàÿ Àêàäåìè÷åñêàÿ", "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", 
     "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", "125008"), short_name = c("23", 
     "óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ", "Ñåâåðíûé àäìèíèñòðàòèâíûé îêðóã", 
     "Ìîñêâà", "Ìîñêâà", "RU", "125008"), types = list("street_number", 
      "route", c("political", "sublocality", "sublocality_level_1" 
      ), c("locality", "political"), c("administrative_area_level_2", 
      "political"), c("country", "political"), "postal_code")), .Names = c("long_name", 
    "short_name", "types"), class = "data.frame", row.names = c(NA, 
    7L))), formatted_address = c("óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ, 23, Ìîñêâà, Ðîññèÿ, 127299", 
"óë. Áîëüøàÿ Àêàäåìè÷åñêàÿ, 23, Ìîñêâà, Ðîññèÿ, 125008"), geometry = structure(list(
    location = structure(list(lat = c(55.8169112, 55.826859), 
     lng = c(37.5202899, 37.529427)), .Names = c("lat", "lng" 
    ), class = "data.frame", row.names = 1:2), location_type = c("ROOFTOP", 
    "ROOFTOP"), viewport = structure(list(northeast = structure(list(
     lat = c(55.8182601802915, 55.8282079802915), lng = c(37.5216388802915, 
     37.5307759802915)), .Names = c("lat", "lng"), class = "data.frame", row.names = 1:2), 
     southwest = structure(list(lat = c(55.8155622197085, 
     55.8255100197085), lng = c(37.5189409197085, 37.5280780197085 
     )), .Names = c("lat", "lng"), class = "data.frame", row.names = 1:2)), .Names = c("northeast", 
    "southwest"), class = "data.frame", row.names = 1:2)), .Names = c("location", 
"location_type", "viewport"), class = "data.frame", row.names = 1:2), 
    partial_match = c(TRUE, TRUE), place_id = c("ChIJnVMw7C1ItUYRdfeWEQrXuAk", 
    "ChIJnbnwOdY3tUYR1_D9pHTqCsI"), types = list("street_address", 
     "street_address")), .Names = c("address_components", 
"formatted_address", "geometry", "partial_match", "place_id", 
"types"), class = "data.frame", row.names = 1:2), status = "OK"), .Names = c("results", 
"status")) 

在第二個列表的 results 元素中,每個可能的地址都多出一層嵌套;展平之後,這會爲該地址產生一個「額外」的觀察值,從而無法將地理編碼結果對應回原始地址列表。我正在使用以下函數將嵌套列表展平爲數據框。當出現額外嵌套時,應如何修改這些函數,使其只取第一個地址?即使該地址不正確,之後與另一個數據框合併時,該建築物也只會從樣本中被丟棄,因此我只關心如何將每個地理編碼觀察與原始數據框(地址的來源)中的相應行匹配。

#' Flatten a geocoding `results` data frame and keep the address/location columns.
#'
#' Nested data-frame columns are expanded with `jsonlite::flatten()`, which
#' joins nested names with dots (e.g. `geometry.location.lat`). Only the
#' formatted address, the location type and the latitude/longitude columns are
#' kept, in the order they appear after flattening.
#'
#' @param df A data frame, typically the `results` element of one geocoding
#'   response; it may contain nested data-frame columns.
#' @return A data frame with one row per row of `df` and whichever of the four
#'   wanted columns are present. The result is always a data frame, even when
#'   only a single column matches.
flatten_googleway <- function(df) {
    # Calling through the namespace avoids attaching jsonlite to the search
    # path and raises an error (rather than a silent FALSE) if it is missing.
    res <- jsonlite::flatten(df)
    wanted <- c("formatted_address", "geometry.location_type",
                "geometry.location.lat", "geometry.location.lng")
    # drop = FALSE keeps a data frame when only one column matches, so the
    # pieces can still be stacked with rbind().
    res[, names(res) %in% wanted, drop = FALSE]
}
# Flatten every geocoding response and stack the pieces into one data frame.
# A response without results falls back to the first row of `template_res`
# (presumably an all-NA placeholder row; `template_res` is defined elsewhere).
# A response holding several candidate addresses contributes one row per
# candidate.
moscowhousegeo.df <- do.call(
    rbind,
    lapply(moscowhouse.list, function(resp) {
        if (length(resp$results) > 0) {
            flatten_googleway(resp$results)
        } else {
            template_res[1, ]
        }
    })
)

## Template for NA results: a zero-row data frame with the same four columns
## that flatten_googleway() keeps. Indexing a row that does not exist
## (`template_res[1, ]`) yields a single all-NA row, which serves as the
## placeholder for responses that produced no usable result.
template_res <- data.frame(
    formatted_address = character(0),
    geometry.location_type = character(0),
    geometry.location.lat = numeric(0),
    geometry.location.lng = numeric(0),
    stringsAsFactors = FALSE
)

回答

0

哎呀,我像往常一樣把事情想得太複雜了。只需修改 lapply() 調用,把所有沒有結果的列表元素,以及 x$results$formatted_address 長度大於 1 的元素(即返回多個可能結果時)都替換爲模板中的 NA 行,就可以簡單地解決這個問題。

# Stack the flattened geocoding responses so that each response contributes
# exactly one row. Responses with no results, or with more than one candidate
# address, are replaced by the first row of `template_res` (presumably an
# all-NA placeholder row; `template_res` is defined outside this snippet).
moscowhousegeo.df <- do.call(rbind, lapply(moscowhouse.list, function(x) {
    # `||` is the scalar, short-circuiting operator meant for `if` conditions;
    # the elementwise `|` is for building vector masks.
    if (length(x$results) == 0 || length(x$results$formatted_address) > 1) {
        template_res[1, ]
    } else {
        flatten_googleway(x$results)
    }
}))

不幸的是,這樣我還是會失去一些數據,但要從給定的選項中確定哪個地址正確,很可能太耗費時間;而且數據集中有這麼多觀察值,這樣做也有點不值得。