2012-03-14 168 views
18

我一直試圖在MongoDB中使用MapReduce來做我認爲是一個簡單的過程。我不知道這是否正確,如果我甚至應該使用MapReduce。我搜索了一下我想到的關鍵詞,並試圖打開我認爲我會取得最大成功的文檔 - 但沒有。也許我在想這個太過分了?在MongoDB中合併兩個集合

我有兩個集合:details 和 gpas

details包含大量的文檔(300多萬個)。其中的studentid元素最多可以重複兩次,每個year一次,如下所示:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1} 
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1} 
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2} 
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1} 
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2} 
... 

gpas具有與details相同的studentid元素。每個studentid只有一個條目,如下所示:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1} 
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5} 
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2} 
... 

最後,我希望得到一個集合,其中每個學生佔一行,格式如下:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1} 
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5} 
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2} 
... 

我打算通過運行如下的MapReduce來做到這一點:

/**
 * Map step for the `details` collection.
 * Emits one value per document, keyed by studentid, carrying the document's
 * class list and year; the GPA fields are filled with 0 as placeholders.
 */
var mapDetails = function () {
    var detail = this;
    var emitted = {
        studentid: detail.studentid,
        classes: detail.classes,
        year: detail.year,
        overall: 0,
        subscore: 0
    };
    emit(detail.studentid, emitted);
};

/**
 * Map step for the `gpas` collection.
 * Emits one value per document, keyed by studentid, carrying the GPA fields;
 * `classes` is an empty array and `year` is 0, which marks the value as a
 * GPA record.
 */
var mapGpas = function () {
    var gpa = this;
    var emitted = {
        studentid: gpa.studentid,
        classes: [],
        year: 0,
        overall: gpa.overall,
        subscore: gpa.subscore
    };
    emit(gpa.studentid, emitted);
};

/**
 * Reduce step used for both collections.
 *
 * Values whose `year` equals 0 supply `overall` and `subscore`. Every other
 * value supplies `studentid` and, when its `year` equals 1 or 2, replaces
 * `classes_1` or `classes_2` with its `classes` field.
 *
 * @param {*} key - Key the values were emitted under (not read here).
 * @param {Array<Object>} values - Values to fold together.
 * @returns {Object} `{studentid, classes_1, classes_2, overall, subscore}`.
 */
var reduce = function (key, values) {
    var merged = {
        studentid: "0",
        classes_1: [],
        classes_2: [],
        overall: 0,
        subscore: 0
    };

    values.forEach(function (entry) {
        // A year of 0 marks a GPA record: copy the scores and move on.
        if (entry.year == 0) {
            merged.overall = entry.overall;
            merged.subscore = entry.subscore;
            return;
        }

        merged.studentid = entry.studentid;
        if (entry.year == 1) {
            merged.classes_1 = entry.classes;
        } else if (entry.year == 2) {
            merged.classes_2 = entry.classes;
        }
    });

    return merged;
};

// Pass 1: map-reduce over `details`, writing into the `joined` collection with the `reduce` output action.
res = db.details.mapReduce(mapDetails, reduce, {out: {reduce: 'joined'}}) 
// Pass 2: map-reduce over `gpas` into the same `joined` collection; with the `reduce` output action,
// keys that already exist there are combined with the new values by calling `reduce` again.
// `res` is overwritten, so only the second call's result remains.
res = db.gpas.mapReduce(mapGpas, reduce, {out: {reduce: 'joined'}}) 

但是當我運行它,這是我得到的集合:

{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } } 
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } } 
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } } 

結果中的classes數組丟失了。

另外,我怎樣才能訪問得到的MapReduce value元素中的元素? MapReduce總是輸出到value或其他任何名稱嗎?

回答

41

這與MongoDB用戶Google Groups上提出的問題類似。
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1

答案引用一個在線教程,類似於你的例子: http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/

對於MongoDB的MapReduce的更多信息,請參閱文檔: http://www.mongodb.org/display/DOCS/MapReduce

此外,MongoDB Cookbook中標題爲「使用版本化文檔查找最大值和最小值」的文章,其「Extras」部分對MapReduce操作的工作方式有一份很有用的循序漸進式演練: http://cookbook.mongodb.org/patterns/finding_max_and_min/

如果你已經閱讀過其中一些參考文檔,還請見諒。我把它們列在這裏,是爲了方便其他可能閱讀本文、並且剛開始在MongoDB中使用MapReduce的用戶。

Map函數中'emit'語句的輸出與Reduce函數的輸出相匹配非常重要。如果只有一個由Map函數輸出的文檔,則Reduce函數可能根本不會運行,然後您的輸出集合將會有不匹配的文檔。

我稍微修改了你的映射語句,以你想要的輸出格式發射文檔,並有兩個單獨的「類」數組。
我還重寫了reduce語句,以便將新類添加到classes_1和classes_2數組中,但前提是它們尚不存在。

/**
 * Map step for the `details` collection.
 * Emits a value already shaped like the reducer's output: the document's
 * class list goes into `classes_1` when year equals 1, into `classes_2` when
 * year equals 2, and both stay empty otherwise. GPA fields are 0 placeholders.
 */
var mapDetails = function () {
    var isYearOne = (this.year == 1);
    var isYearTwo = (this.year == 2);

    emit(this.studentid, {
        studentid: this.studentid,
        classes_1: isYearOne ? this.classes : [],
        classes_2: isYearTwo ? this.classes : [],
        year: this.year,
        overall: 0,
        subscore: 0
    });
};

/**
 * Map step for the `gpas` collection.
 * Emits a value shaped like the reducer's output: both class arrays are
 * empty, `year` is 0 to mark a GPA record, and `overall` / `subscore` come
 * from the document.
 */
var mapGpas = function () {
    var gpaValue = {
        studentid: this.studentid,
        classes_1: [],
        classes_2: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaValue);
};

/**
 * Reduce step shared by both map-reduce passes: folds every value emitted
 * for one student into a single record.
 *
 * @param {string} key - The studentid the values were emitted under.
 * @param {Array<Object>} values - Values carrying `studentid`, `classes_1`,
 *     `classes_2`, `overall`, `subscore` and, for map output, `year`.
 * @returns {Object} `{studentid, classes_1, classes_2, overall, subscore}`
 *     where each class array is the de-duplicated union of the inputs.
 */
var r = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    // Appends each item of `source` to `target` unless it is already present.
    // Kept inside the reducer so the function remains self-contained.
    // (`class` is a reserved word, so it cannot be used as a parameter name.)
    var addMissing = function(target, source) {
        source.forEach(function(item) {
            if (target.indexOf(item) === -1) {
                target.push(item);
            }
        });
    };

    values.forEach(function(v) {
        outs.studentid = v.studentid;
        addMissing(outs.classes_1, v.classes_1);
        addMissing(outs.classes_2, v.classes_2);

        // mapGpas emits year 0, so only those values supply the GPA fields.
        if (v.year === 0) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });
    return outs;
};

// Pass 1: map-reduce over `details`, writing into the `joined` collection with the `reduce` output action.
res = db.details.mapReduce(mapDetails, r, {out: {reduce: 'joined'}}) 
// Pass 2: map-reduce over `gpas` into the same `joined` collection; with the `reduce` output action,
// keys that already exist there are combined with the new values by calling `r` again.
// `res` is overwritten, so only the second call's result remains.
res = db.gpas.mapReduce(mapGpas, r, {out: {reduce: 'joined'}}) 

運行兩個MapReduce的操作結果如下集合,它符合您需要的格式:

> db.joined.find() 
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } } 
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } } 
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } } 
> 

MapReduce的輸出始終是 {_id: "ID", value: "value"} 形式的文檔。關於如何處理子文檔,文檔中標題爲「點表示法(深入對象內部)」的部分有更多信息: http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29

如果你想讓MapReduce的輸出以不同的格式顯示,您必須在應用程序中以編程方式執行此操作。

希望這會提高您對MapReduce的理解,並讓您更近一步地生成所需的輸出集合。祝你好運!

+0

這非常有幫助。我非常感謝你投入這篇文章。再次感謝! – TFX 2012-03-17 06:58:59

+0

我的榮幸!我很高興能夠幫助!真誠的,馬克 – Marc 2012-03-19 22:17:25

2

您不能使用m/r,因爲該設計僅適用於一個集合。從多個集合中讀取將破壞分片兼容性,因此不被允許。你可以用新的聚合框架(2.1+)做你想做的或者在你的應用程序中做這個。