MapReduce的麻煩計數

我有一個問題。我在 MongoDB 中的數據看起來像這樣：

{"miejscowosci_str":"OneCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "...", ...} 
{"miejscowosci_str":"TwoCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "...", ...} 
{"miejscowosci_str":"ThreeCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "...", ...} 
{"miejscowosci_str":"FourCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "...", ...}

我想做的是列出所有地區（wojewodztwo），並計算每個地區中平均每個城市擁有的郵政編碼數量。我知道如何統計每個地區的郵政編碼總數：

// Map step: emit a count of 1 for every document, keyed by its region
// (the `wojewodztwo` field), so the reducer can total documents per region.
var map = function () {
    var region = this.wojewodztwo;
    emit(region, 1);
};
// Reduce step: total the counts emitted for a single region.
//
// key - the region name the values were emitted under (unused here).
// val - array of numeric counts; each is either the 1 emitted by map or a
//       partial sum returned by an earlier reduce call for the same key.
// Returns the sum of all entries in `val` (0 for an empty array).
var reduce = function(key, val) {
    var count = 0;
    // Use a declared index instead of `for (i in val)`: the undeclared `i`
    // leaked an implicit global (a ReferenceError in strict mode), and
    // for...in on an array also walks any inherited enumerable properties.
    for (var idx = 0; idx < val.length; idx++) {
        count += val[idx];
    }
    return count;
};
// Run the counting job over the kodypocztowe collection and write the
// per-region totals to the "result" collection.
db.kodypocztowe.mapReduce(map, reduce, { out: "result" });

但我不知道如何統計城市（miejscowosci_str）的數量，以便將每個地區的郵政編碼數除以同一地區的城市數。一個城市可以有多個郵政編碼。

你有什麼想法嗎？

來源

2013-05-16 user1337192

我在這裏做了幾個假設：

- 一個城市可以有多個郵政編碼
- 郵政編碼是唯一的
- 你不是在試圖獲取 M101P 第 5 週習題的答案！

與其只 emit 一個 1，不如在 map 階段建立「城市/郵政編碼」對象的列表，然後在 reduce 階段把它歸約成郵政編碼列表和不重複的城市列表。之後你可以使用 finalize 階段來計算平均值。

注意：如果數據集很大，你可能要考慮改用 aggregation framework；其寫法列在下面的 map/reduce 示例之後。

// Start from a clean slate: remove the input collection and any previous output.
db.kodypocztowe.drop();
db.result.drop();

// Seed five sample documents: two single-zip cities in the first region, and
// in the second region one single-zip city plus one city with two zips.
db.kodypocztowe.insert([
    { miejscowosci_str: "OneCity", wojewodztwo: "FirstRegionName", "ZIP-Code": "1" },
    { miejscowosci_str: "TwoCity", wojewodztwo: "FirstRegionName", "ZIP-Code": "2" },
    { miejscowosci_str: "ThreeCity", wojewodztwo: "SecondRegionName", "ZIP-Code": "3" },
    { miejscowosci_str: "FourCity", wojewodztwo: "SecondRegionName", "ZIP-Code": "4" },
    { miejscowosci_str: "FourCity", wojewodztwo: "SecondRegionName", "ZIP-Code": "5" }
]);

// Map step: for each document emit, under its region (wojewodztwo), a
// { city: name, zip: code } pair describing that document.
// Note: a city may have several zips, but every zip is assumed to be unique.
var map = function () {
    var entry = {
        city: this.miejscowosci_str,
        zip: this['ZIP-Code']
    };
    emit(this.wojewodztwo, entry);
};

//
// Reduce step: collapse the values of one region into
//
// {zips: [...], cities: [...]}
//
// key - the region name (unused here).
// val - array whose entries are either raw {city, zip} pairs or
//       {zips, cities} objects returned by an earlier call of this function.
//
// note : every zip is appended (zips are assumed to be unique)
// note : a city is appended only if it is not already in the list
//
// MongoDB may call reduce several times for the same key and feed the output
// of one call back in as an input value of the next. The original only read
// .city/.zip, so on such a re-reduce it pushed undefined for every partial
// result and lost the real data, which surfaced as null entries on larger
// collections. Both value shapes are therefore handled here.
//
var reduce = function(key, val) {
    var res = {zips: [], cities: []};

    // Append a city unless it has been recorded already.
    var addCity = function(city) {
        if (res.cities.indexOf(city) === -1) {
            res.cities.push(city);
        }
    };

    // Declared indices: the original `for (i in val)` leaked a global `i`.
    for (var idx = 0; idx < val.length; idx++) {
        var item = val[idx];
        if (item.zips !== undefined && item.cities !== undefined) {
            // Partial result from a previous reduce pass: merge both lists.
            for (var z = 0; z < item.zips.length; z++) {
                res.zips.push(item.zips[z]);
            }
            for (var c = 0; c < item.cities.length; c++) {
                addCity(item.cities[c]);
            }
        } else {
            // Raw {city, zip} pair straight from the map step.
            res.zips.push(item.zip);
            addCity(item.city);
        }
    }
    return res;
};

//
// Finalize step: turn the value of one region into {average: n}, where n is
// the number of zips divided by the number of distinct cities.
//
// key - the region name (unused here).
// res - normally a {zips: [...], cities: [...]} object. MongoDB does not call
//       reduce for a key that has only one emitted value, so res can also be
//       a single un-reduced value without those lists; the original threw on
//       `res.zips.length` in that case.
// Returns a new {average: Number}; the input object is left untouched.
var finalize = function(key, res) {
    if (res.zips === undefined || res.cities === undefined) {
        // A single un-reduced value stands for one zip in one city.
        return {average: 1};
    }
    return {average: res.zips.length / res.cities.length};
};

// Section banner for the map/reduce variant.
print("==============");
print(" map/reduce");
print("==============");

// Run the job with the finalize step, store the output in the "result"
// collection, then dump that collection in readable form.
db.kodypocztowe.mapReduce(map, reduce, { out: "result", finalize: finalize });
db.result.find().pretty();


// Section banner for the aggregation-framework variant.
print("==============");
print(" aggregation");
print("==============");

db.kodypocztowe.aggregate([
    // Stage 1: one document per [region, city] pair, counting its zips.
    {
        $group: {
            _id: { region: "$wojewodztwo", city: "$miejscowosci_str" },
            zips: { $sum: 1 }
        }
    },
    // Stage 2: per region, count the cities and add up their zip counts.
    {
        $group: {
            _id: "$_id.region",
            cities: { $sum: 1 },
            zips: { $sum: "$zips" }
        }
    },
    // Stage 3: shape the output like the map/reduce result (value.average).
    {
        $project: {
            "value.average": { $divide: ["$zips", "$cities"] }
        }
    }
]);

希望這對你有幫助。

來源

2013-05-16 16:41:32 jimoleary

謝謝，這確實有效，但我遇到了一些問題。我的集合包含大約 21k 個文檔，結果中大多數地區的 zips 裏只有少數幾個空值（null），cities 裏也只有少數幾個空值；有兩個地區包含了一部分郵政編碼和城市，但其餘地區只包含少數空值。看起來像是存在某種執行時間限制，你知道如何解決嗎？我想用 mapreduce 來完成，因爲這是我的學習作業。抱歉問這麼基礎的問題，我是 mongodb 的新手。謝謝 – user1337192

嗯，我試着在一個包含 29,000 個條目的數據集上運行代碼，甚至在 map 方法中加了 5 秒的睡眠時間，它仍然沒有超時。我想更有可能是某處出現了異常，日誌中是否有相關消息？就空值而言，您可以在 **out** 參數之外再添加一個 [query](http://docs.mongodb.org/manual/reference/method/db.collection.mapReduce/#db.collection.mapReduce) 參數，過濾掉相關字段不存在或爲 null 的文檔（使用 **$exists** 和 **$ne: null**）。運算符的說明在[這裏](http://docs.mongodb.org/manual/reference/operator/) – jimoleary

我不得不修改了一些代碼，這是我的答案：http://pastebin.com/pmPDjHWU 。城市和郵政編碼不再通過 [&] 傳遞之後，我就沒有空值了，現在沒問題了 – user1337192

MapReduce的麻煩計數

回答

相關問題