I'm designing system that should be able to process millions of documents and report on them in different ways. mongoDb map\reduce task is what I'm trying to implement (currently doing some investigation on that). The very basic document structure is
db.test.insert(
{
"_id" : ObjectId("4f6063601caf46303c36eb27"),
"verbId" : NumberLong(1506281),
"sentences" : [
{
"sId" : NumberLong(2446630),
"sentiment" : 2,
"categories" : [
NumberLong(3257),
NumberLong(3221),
NumberLong(3291)
]
},
{
"sId" : NumberLong(2446631),
"sentiment" : 0,
"categories" : [
NumberLong(2785),
NumberLong(2762),
NumberLong(2928),
NumberLong(2952)
]
},
{
"sId" : NumberLong(2446632),
"sentiment" : 0,
"categories" : [
NumberLong(-2393)
]
},
{
"sId" : NumberLong(2446633),
"sentiment" : 0,
"categories" : [
NumberLong(-2393)
]
}
]
})
So that each document contains sentences, that could belong to different categories. The report I'm trying to get is number of sentences in category (with percent of verbatims).
I'm doing next map-reduce jobs with finalize method to count different averages.
var map = function() {
var docCategories = new Array();
var catValues = new Array();
for (var i = 0; i < this.sentences.length; i++) { //iterate over sentences.
sentence = this.sentences[i];
for (var j = 0; j < sentence.categories.length; j++) {//iterate over categories
catId= sentence.categories[j].toNumber();
if (docCategories.indexOf(catId) < 0) {
docCategories.push(catId);
catValues.push({sentiment : sentence.sentiment, sentenceCnt: 1});
} else {
categoryIdx = docCategories.indexOf(catId);
catValue = catValues[categoryIdx];
catValue.sentiment = catValue.sentiment + sentence.sentiment;
catValue.sentenceCnt = catValue.sentenceCnt + 1;
}
}
}
totalCount++; //here we do try to count distinctCases see scope.
for (var i = 0; i < docCategories.length; i ++) {
emit(docCategories[i], {count: 1, sentenceCnt: catValues[i].sentenceCnt, sentiment: catValues[i].sentiment, totalCnt : totalCount});
}
};
var reduce = function(key, values) {
var res = {count : 0, sentenceCnt : 0, sentiment : 0};
for ( var i = 0; i < values.length; i ++ ) {
res.count += values[i].count;
res.sentenceCnt += values[i].sentenceCnt;
res.sentiment += values[i].sentiment;
}
return res;
};
var finalize = function(category, values) {
values.sentimentAvg = values.sentiment / values.sentenceCnt;
values.percentOfVerbatim = values.count / totalCount //scope variable (global)
return values;
};
var res = db.runCommand( { mapreduce:'test',
map:map,
reduce:reduce,
out: 'cat_volume',
finalize:finalize,
scope:{totalCount : 0},
});
The most interesting part here is that I'm using totalCount - to count number of verbatims I'm emitting. totalCount is the scope (global) variable. Everything went well on One mongoDb installation, but when going to a shard instances I'm getting "Infinity" for percentOfVerbatim.
Actually in that case totalCount would be just db.test.count() (number of documents) but in future I'm going to add different conditions for documents to be count. Doing any other query is very undesirable since db is very heavy.
Are there any other approaches to using global (scope) variables on multi-instance mongodb installation? Or should I use something else?