2017-01-03 144 views
2
df1 <- data.frame(w = 1:4, x = c("a", "b", "b", "c"), y = NA) 
df1$y[df1$x == "c"] = 1 
df2 <- data.frame(x = c("a", "b"), y = 1:2, z = 3:4) 

我想用df2中的值更新df1以匹配行,包括所有df2列。更新另一個數據幀的數據幀行和列的子集

預期結果:

df1 
    w x y z 
1 1 a 1 3 
2 2 b 2 4 
3 3 b 2 4 
4 4 c 1 NA 

這裏是我的嘗試:

# add missing columns from df2 to df1 
df1[setdiff(colnames(df2), colnames(df1))] <- NA 
# update values in df1 from df2 for matching x 
df1[df1$x %in% df2$x, colnames(df2[,-1])] <- df2[match(df1$x, df2$x),-1] 

這不起作用,因爲DF2 [匹配(DF1 $ X,DF2 $ X), - 1]還包括不匹配。什麼是正確的方法來做到這一點? (我的實際數據幀有更多的行和列。)

編輯1:根據請求,該示例並不代表我的實際數據的完整複雜性。在現實生活中,我正在處理df辦公室(df1)和結果(df2)。列「x」是「地址」。

colnames(offices) 
[1] "org_order"     "organizations.api_path"  "type"      
[4] "uuid"      "name"      "street_1"     
[7] "street_2"     "postal_code_o"     "city"      
[10] "city_web_path"    "region"      "region_code2"    
[13] "region_web_path"    "country_o"      "country_code2"    
[16] "country_code3"    "country_web_path"   "latitude"     
[19] "longitude"     "created_at"     "updated_at"     
[22] "address"      "administrative_area_level_1" "administrative_area_level_2" 
[25] "country"     "locality"     "neighborhood"    
[28] "postal_code"    "route"      "street_number"    
colnames(results) 
[1] "address"      "administrative_area_level_1" "administrative_area_level_2" 
[4] "country"      "locality"     "neighborhood"    
[7] "postal_code"     "route"      "street_number"    
[10] "subpremise"     "postal_code_suffix"   "premise"      
[13] "political" 

請注意,這是在這個時間點的圖片。我希望df結果可以添加更多列(它是通過Google Maps API進行更新的,Google Maps API根據位置返回可變字段)。

編輯2:這是來自DF辦事處和結果的示例數據。

structure(list(org_order = c("1", "3", "4", "8", "9", "10", "11", 
"11", "13", "13", "14", "15", "16", "18", "21", "23", "24", "25", 
"26", "27"), organizations.api_path = c("organizations/care1st-health-plan-arizona", 
"organizations/telecommunications-and-information-administration", 
"organizations/multimatecollection-com", "organizations/ion-holdco", 
"organizations/itris-bv", "organizations/crius-energy", "organizations/uwc-mahindra-college", 
"organizations/uwc-mahindra-college", "organizations/ustech-discovery", 
"organizations/ustech-discovery", "organizations/play-park-structures", 
"organizations/wings-rotors", "organizations/attractive-world", 
"organizations/bigtoys", "organizations/intercontec-produkt-gmbh", 
"organizations/elitesingles", "organizations/rumi-x-limited", 
"organizations/gt-grandstands", "organizations/new-health-capital-partners", 
"organizations/urban-media-2"), type = c("Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address" 
), uuid = c("e28deae838eb745ab093bb3d2f66963a", "ae150251316913fd72a1ba9df18bd686", 
"d49e135e7b61d01acf6609e1c5c1a34d", "359399c4f6db344f530897caabce666e", 
"2b4a469b217e5090611a539d952412f3", "3939cfa28f2191535bcdaf4297bfa488", 
"86620bc9b675150a847496bcfbb8f6b9", "ee869cc495e84ebdbbbbd6db0ed5d3c0", 
"f31c6e197494192ec47bcb36d14bc206", "2fb8855d0bcccde661aacbbf69d92a3e", 
"724967633f0be037df09bf63e8bd932d", "8d765e1b0111298f2e295883ac0c1f52", 
"ea0cd535b2a113b9c7d1cbec466963eb", "a7d69835d7abdbee8d28c77e9c54f176", 
"243b70fedc6ff9f6b68bb1ae1cb5f5a1", "7cec7f51b87fff47acf74a0424c88d30", 
"e1c4108f8e4d902ed3524e98418cb157", "666dffa11b2de8aa35307a35d19ec117", 
"d5d56654e892d46c303deebd9b71aaaa", "deb420cf0a669897e0058ab854e928bc" 
), name = c("Headquarters", "Headquarters", "Delhi", "Headquarters", 
"Headquarters", "HQ", "Israel Office", "Headquarters", "Israel Office", 
"Headquarters", "Headquarters", "Israel Office", "Headquarters", 
"Headquarters", "HQ", "Headquarters", "Headquarters & Showroom", 
"Headquarters", "Headquarters", "Israel Office"), street_1 = c("2355 E. Camelback Road", 
"1401 Constitution Ave NW", "", "80 State Street 7th Floor", 
"Spray Gaarde 46", "1055 Washington Blvd., 7th Floor", "", "Village Khubavali, PO Paud Taluka Mulshi", 
"", "231 Lagrange Street", "401 Chestnut St", "st. ha Rav Bar Shaul 6", 
"", "7721 New Market Street", "Bernrieder Str. 15", "", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER", 
"2810 Sydney Road", "1350 Avenue of the Americas", ""), street_2 = c("Suite 300", 
"", "", "", "", "", "", "", "", "", "Suite 410", "", "", "", 
"", "", "99 WELLINGTON ST, CENTRAL,", "", "9th Floor", ""), postal_code_cb = c("85016", 
"20230", "", "12207", "03436", "06901", "", "412 108", "", "02132", 
"37402", "7625149", "", "98501", "94559", "", "", "33566-1173", 
"10019", ""), city = c("Phoenix", "Washington", "", "Albany", 
"Nieuwegein", "Stamford", "", "Pune", "", "Boston", "Chattanooga", 
"Rehovot", "Paris", "Olympia", "Niederwinkling", "Dover", "Hong Kong", 
"Plant City", "New York", ""), city_web_path = c("location/phoenix/e55cf011f3c746498d0d801bc399dc52", 
"location/washington/09d220d540bacd296f7a5035ff6e6bef", "location/india/44048bf7db640d7adb20fd3c1ebf47b0", 
"location/albany/b40478e7b38920051a2a0f501b6e8c65", "location/nieuwegein/93f71c3141f8b16e539839308acd911e", 
"location/stamford/01272fa9253161fb0b1e69fe0fa8a53c", "location/israel/ab89e1a9013f82613b5f60487350334d", 
"location/pune/2c69941daa1ea666db3cb74f2029ae62", "location/israel/ab89e1a9013f82613b5f60487350334d", 
"location/boston/9898e533996b05148cc5302f60454c02", "location/chattanooga/9ecb7425cc53be56a353c7a127bf2f56", 
"location/rehovot/fd22cd6db48f2c192068a1bc99853c08", "location/paris/5805284c00a4f0aab8115843398ff2bb", 
"location/olympia/b289b456f00ca6c0536b2032da46985f", "location/niederwinkling/026a83aec4e324d209cd3e1b257b55d5", 
"location/dover/796562878525d2957e1dc10a32a5e13e", "location/hong-kong/ad27df91c866cb08f3b624121637bf69", 
"location/plant-city/646eec50d4fc27131241e0bebdae3673", "location/new-york/d64b7615985cfbf44affaa89d70c4050", 
"location/israel/ab89e1a9013f82613b5f60487350334d"), region = c("Arizona", 
"District of Columbia", "", "New York", "Utrecht", "Connecticut", 
"", "Maharashtra", "", "Massachusetts", "Tennessee", "HaMerkaz", 
"Ile-de-France", "Washington", "Bayern", "Delaware", "Hong Kong Island", 
"Florida", "New York", ""), region_code2 = c("AZ", "DC", "", 
"NY", "9", "CT", "", "16", "", "MA", "TN", "2", "A8", "WA", "2", 
"DE", "", "FL", "NY", ""), region_web_path = c("", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "" 
), country_cb = c("United States", "United States", "India", 
"United States", "The Netherlands", "United States", "Israel", 
"India", "Israel", "United States", "United States", "Israel", 
"France", "United States", "Germany", "United States", "Hong Kong", 
"United States", "United States", "Israel"), country_code2 = c("US", 
"US", "IN", "US", "NL", "US", "IL", "IN", "IL", "US", "US", "IL", 
"FR", "US", "DE", "US", "HK", "US", "US", "IL"), country_code3 = c("USA", 
"USA", "IND", "USA", "NLD", "USA", "ISR", "IND", "ISR", "USA", 
"USA", "ISR", "FRA", "USA", "DEU", "USA", "HKG", "USA", "USA", 
"ISR"), country_web_path = c("", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", ""), latitude = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), longitude = c("", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", ""), created_at = c("1475743389", 
"1475899280", "1475743519", "1475899256", "1475742451", "1475742715", 
"1475742786", "1475742171", "1475741845", "1475741660", "1475899240", 
"1475741599", "1475741472", "1475899218", "1475741445", "1475741067", 
"1475742388", "1475899207", "1475740101", "1475740590"), updated_at = c("1475743390", 
"1475899280", "1475743519", "1475899257", "1475742451", "1475742715", 
"1475742786", "1475742171", "1475741845", "1475741660", "1475899240", 
"1475741599", "1475741472", "1475899218", "1475741445", "1475741067", 
"1475742388", "1475899207", "1475740101", "1475740590"), address = c("2355 E. Camelback Road Suite 300 85016 Phoenix Arizona", 
"1401 Constitution Ave NW 20230 Washington District of Columbia", 
"", "80 State Street 7th Floor 12207 Albany New York", "Spray Gaarde 46 03436 Nieuwegein Utrecht", 
"1055 Washington Blvd., 7th Floor 06901 Stamford Connecticut", 
"", "Village Khubavali, PO Paud Taluka Mulshi 412 108 Pune Maharashtra", 
"", "231 Lagrange Street 02132 Boston Massachusetts", "401 Chestnut St Suite 410 37402 Chattanooga Tennessee", 
"st. ha Rav Bar Shaul 6 7625149 Rehovot HaMerkaz", "Paris Ile-de-France", 
"7721 New Market Street 98501 Olympia Washington", "Bernrieder Str. 15 94559 Niederwinkling Bayern", 
"Dover Delaware", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER 99 WELLINGTON ST, CENTRAL, Hong Kong Hong Kong Island", 
"2810 Sydney Road 33566-1173 Plant City Florida", "1350 Avenue of the Americas 9th Floor 10019 New York New York", 
""), administrative_area_level_1 = c("Arizona", "District of Columbia", 
NA, "New York", NA, "Connecticut", NA, NA, NA, "Massachusetts", 
"Tennessee", "Center District", "?e-de-France", "Washington", 
"Bayern", "Delaware", NA, NA, NA, NA), administrative_area_level_2 = c("Maricopa County", 
NA, NA, "Albany County", NA, "Fairfield County", NA, NA, NA, 
"Suffolk County", "Hamilton County", "Rehovot", "Paris", "Thurston County", 
"Niederbayern", "Kent County", NA, NA, NA, NA), country = c("United States", 
"United States", NA, "United States", NA, "United States", NA, 
NA, NA, "United States", "United States", "Israel", "France", 
"United States", "Germany", "United States", NA, NA, NA, NA), 
    locality = c("Phoenix", "Washington", NA, "Albany", NA, "Stamford", 
    NA, NA, NA, "Boston", "Chattanooga", "Rehovot", "Paris", 
    "Tumwater", "Niederwinkling", "Dover", NA, NA, NA, NA), neighborhood = c("Camelback East Village", 
    "Northwest Washington", NA, "Downtown", NA, "Downtown", NA, 
    NA, NA, "West Roxbury", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), postal_code = c("85016", "20230", NA, "12207", NA, "06901", 
    NA, NA, NA, "02132", "37402", NA, NA, "98501", "94559", NA, 
    NA, NA, NA, NA), route = c("East Camelback Road", "Constitution Avenue Northwest", 
    NA, "State Street", NA, "Washington Boulevard", NA, NA, NA, 
    "Lagrange Street", "Chestnut Street", "HaRav Bar Shaul Street", 
    NA, "New Market Street Southwest", "Bernrieder Stra?", NA, 
    NA, NA, NA, NA), street_number = c("2355", "1401", NA, "80", 
    NA, "1055", NA, NA, NA, "231", "401", "6", NA, "7721", "15", 
    NA, NA, NA, NA, NA), subpremise = c("300", NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, "410", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA)), .Names = c("org_order", "organizations.api_path", "type", 
"uuid", "name", "street_1", "street_2", "postal_code_cb", "city", 
"city_web_path", "region", "region_code2", "region_web_path", 
"country_cb", "country_code2", "country_code3", "country_web_path", 
"latitude", "longitude", "created_at", "updated_at", "address", 
"administrative_area_level_1", "administrative_area_level_2", 
"country", "locality", "neighborhood", "postal_code", "route", 
"street_number", "subpremise"), class = c("data.table", "data.frame" 
), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x00000000001e0788>) 

structure(list(address = structure(1:18, .Label = c("2355 E. Camelback Road Suite 300 85016 Phoenix Arizona", 
    "1401 Constitution Ave NW 20230 Washington District of Columbia", 
    "80 State Street 7th Floor 12207 Albany New York", "Spray Gaarde 46 3436 Nieuwegein Utrecht", 
    "1055 Washington Blvd., 7th Floor 6901 Stamford Connecticut", 
    "Village Khubavali, PO Paud Taluka Mulshi 412 108 Pune Maharashtra", 
    "231 Lagrange Street 2132 Boston Massachusetts", "401 Chestnut St Suite 410 37402 Chattanooga Tennessee", 
    "st. ha Rav Bar Shaul 6 7625149 Rehovot HaMerkaz", "Paris Ile-de-France", 
    "7721 New Market Street 98501 Olympia Washington", "Bernrieder Str. 15 94559 Niederwinkling Bayern", 
    "Dover Delaware", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER 99 WELLINGTON ST, CENTRAL, Hong Kong Hong Kong Island", 
    "2810 Sydney Road 33566-1173 Plant City Florida", "1350 Avenue of the Americas 9th Floor 10019 New York New York", 
    "Askanischer Platz 3 10963 Berlin Berlin", "Level 2, 145 Flinders Lane 03000 Melbourne Victoria" 
    ), class = "factor"), administrative_area_level_1 = c("Arizona", 
    "District of Columbia", "New York", NA, "Connecticut", NA, "Massachusetts", 
    "Tennessee", "Center District", "?e-de-France", "Washington", 
    "Bayern", "Delaware", "Hong Kong Island", "Florida", "New York", 
    "Berlin", "Victoria"), administrative_area_level_2 = c("Maricopa County", 
    NA, "Albany County", NA, "Fairfield County", NA, "Suffolk County", 
    "Hamilton County", "Rehovot", "Paris", "Thurston County", "Niederbayern", 
    "Kent County", NA, "Hillsborough County", "New York County", 
    NA, "Melbourne City"), country = c("United States", "United States", 
    "United States", NA, "United States", NA, "United States", "United States", 
    "Israel", "France", "United States", "Germany", "United States", 
    "Hong Kong", "United States", "United States", "Germany", "Australia" 
    ), locality = c("Phoenix", "Washington", "Albany", NA, "Stamford", 
    NA, "Boston", "Chattanooga", "Rehovot", "Paris", "Tumwater", 
    "Niederwinkling", "Dover", NA, "Plant City", "New York", "Berlin", 
    "Melbourne"), neighborhood = c("Camelback East Village", "Northwest Washington", 
    "Downtown", NA, "Downtown", NA, "West Roxbury", NA, NA, NA, NA, 
    NA, NA, "Central", NA, NA, NA, NA), postal_code = c("85016", 
    "20230", "12207", NA, "06901", NA, "02132", "37402", NA, NA, 
    "98501", "94559", NA, NA, "33566", "10019", "10963", "3000"), 
     route = c("East Camelback Road", "Constitution Avenue Northwest", 
     "State Street", NA, "Washington Boulevard", NA, "Lagrange Street", 
     "Chestnut Street", "HaRav Bar Shaul Street", NA, "New Market Street Southwest", 
     "Bernrieder Stra?", NA, "Wellington Street", "Sydney Road", 
     "6th Avenue", "Askanischer Platz", "Flinders Lane"), street_number = c("2355", 
     "1401", "80", NA, "1055", NA, "231", "401", "6", NA, "7721", 
     "15", NA, "99", "2810", "1350", "3", "145"), subpremise = c("300", 
     NA, NA, NA, NA, NA, NA, "410", NA, NA, NA, NA, NA, "1801", 
     NA, NA, NA, "2"), postal_code_suffix = c(NA, NA, NA, NA, 
     NA, NA, "3450", NA, NA, NA, NA, NA, NA, NA, "1173", NA, NA, 
     NA), premise = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
     NA, NA, NA, "Waga Commercial Centre", NA, NA, NA, NA), political = c(NA, 
     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Manhattan", 
     "Bezirk Friedrichshain-Kreuzberg", NA)), .Names = c("address", 
    "administrative_area_level_1", "administrative_area_level_2", 
    "country", "locality", "neighborhood", "postal_code", "route", 
    "street_number", "subpremise", "postal_code_suffix", "premise", 
    "political"), row.names = c(NA, -18L), class = c("data.table", 
    "data.frame"), .internal.selfref = <pointer: 0x00000000001e0788>) 
+1

對於y列,最後的值應該是NA,就像列z一樣。 –

+1

我想我們只會從'df2'中取代它,以防它最初在'df1'中丟失。 – Aramis7d

+0

@Chirayu Chamoli這是有目的的,因爲整個事情實際上是一個迭代過程。 df2將用新行更新多次。我需要更新df1中的新行,而不是舊行。這也是爲什麼我無法一起完成整合的原因。 – syre

回答

6

這是一個完全霧化的data.table版本,用於更新兩個數據集中存在的所有列,並添加df2中同時不存在的df1中的列。這將在地方更新df1

cols <- setdiff(colnames(df2), "x") 
setDT(df1)[setDT(df2), (cols) := mget(paste0('i.', cols)), on = "x"] 
df1 
# w x y z 
# 1: 1 a 1 3 
# 2: 2 b 2 4 
# 3: 3 b 2 4 
# 4: 4 c 1 NA 

背後paste0('i.', cols)的想法是告訴data.table,我們要採取從位於i個位置(df2)的data.table列所以它會知道如何處理列存在於兩個數據集中。


免責聲明:這個想法是從這個@eddi's answer

+0

謝謝@David Arenburg! – syre

+0

如果我將目標df「即時」子集,例如'setDT(df1 [is.na(df1 $ y),])[setDT(df2),(cols):= mget(paste0('i。',cols)),on =「x」]'警告_Coerced'整數'RHS爲'double'以匹配列的類型_並且該命令不起作用。 – syre

+1

在R的同一列中不能有兩個不同的類。另外,'setDT(df1 [is.na(df1 $ y),])'不是一個正確的語法。也許只是'df1 [is.na(y)]'(如果'df'已經是'data.table')。另外,請看看這裏:http://stackoverflow.com/questions/29658627/conditional-binary-join-and-update-by-reference-using-the-data-table-package –

2

使用dplyr,你可以嘗試:

dplyr::left_join(df1,df2,by="x") %>% 
    dplyr::mutate(y = dplyr::coalesce(as.integer(y.x),as.integer(y.y))) %>% 
    dplyr::select(w,x,y,z) 

這給

# w x y z 
#1 1 a 1 3 
#2 2 b 2 4 
#3 3 b 2 4 
#4 4 c 1 NA 

附:看看您嘗試使用setdiff()而不是colnames,可能會發現所提供的示例並不代表您的實際問題的完整複雜性。在這種情況下,請更新示例。

+2

這也是一種手冊。你將不得不眼睛的結果,並手工修復所有列出現在這兩個數據集 –

+0

@DavidArenburg如果在合併時,我加了'suffix = c(「zzzz」,「zzzz」)',以便我們稍後知道哪些除'x'之外,所有列都是常見的。然後,我們可以使用'grepl'來處理它們 –

+0

@ joel.wilson可能工作,不知道 –

0

人們也可以在這裏使用merge借來的:

dfm = merge(df1, df2, by="x", all=T) 
dfm 
    x w y.x y.y z 
1 a 1 NA 1 3 
2 b 2 NA 2 4 
3 b 3 NA 2 4 
4 c 4 1 NA NA 

必填Y可以通過以下方式獲取:

dfm$y = ifelse(is.na(dfm$y.y), dfm$y.x, dfm$y.y) 
dfm[c("w","x","y","z")] 
    w x y z 
1 1 a 1 3 
2 2 b 2 4 
3 3 b 2 4 
4 4 c 1 NA 

編輯: 以上適用於多個常見公司lumns,得到這樣的列名:

> inames = intersect(names(df1), names(df2)) 
> inames 
[1] "address"      "administrative_area_level_1" "administrative_area_level_2" "country"      
[5] "locality"     "neighborhood"    "postal_code"     "route"      
[9] "street_number"    "subpremise"    

由於第一列「地址」已被用於合併,其餘的列用來查找非NA值:

> for(i in inames[2:10]) dfm[[i]] = ifelse(is.na(dfm[[paste0(i,".x")]]), dfm[[paste0(i,".y")]], dfm[[paste0(i,".x")]] ) 

所需的列添加到DFM。

+0

OP有30列左右,如何處理所有這些?通過目測和手工修復? –

+0

我在上面的答案中添加了一個可能的解決方案來處理多個常見列。請發言。 – rnso

相關問題