从MongoDB集合中删除重复数据,
备份数据127.0.0.1数据库中的test 数据库的train_code 集合,备份到 backfile文件,账号为username,密码为password
mongoexport -h 127.0.0.1:7392 -d test -c train_code -o backfile-u username -p password
还原数据
mongoimport -h 127.0.0.1:7392 -d test -c train_code -u username -p password --file backfile
查找重复数据,这里只查找code列重复的数据
mapReduce函数传入了两个函数map和Reduce,map映射函数中调用emit(key,value),将原有集合按照key分组,转换成[key:[value,value]]的结构,这里传入了(this.code,1),代表每个值都是1,返回的结果是["code":[1,1,1...]],Reduce函数负责将结果集合合并成转换成[key:Reduce函数运算结果]的结构
最后的结果会输出到临时表temp_map_reduce
> db.train_code.mapReduce(function(){if(this.code){emit(this.code,1);}},function(key,values){return Array.sum(values
);},{out:"temp_map_reduce"});
{
"result" : "temp_map_reduce",
"timeMillis" : 1295,
"counts" : {
"input" : 8961,
"emit" : 8961,
"reduce" : 29,
"output" : 8932
},
"ok" : 1
}
成功了,现在查找重复的数据,_id代表id,value代表重复的数量
> db.temp_map_reduce.find({value:{$gt:1}});
{ "_id" : "3337", "value" : 2 }
{ "_id" : "3736", "value" : 2 }
{ "_id" : "6920", "value" : 2 }
{ "_id" : "D2270", "value" : 2 }
{ "_id" : "D3709", "value" : 2 }
{ "_id" : "D4698", "value" : 2 }
{ "_id" : "D5654", "value" : 2 }
{ "_id" : "D7755", "value" : 2 }
{ "_id" : "D8246", "value" : 2 }
{ "_id" : "G452", "value" : 2 }
{ "_id" : "G6241", "value" : 2 }
{ "_id" : "G7618", "value" : 2 }
{ "_id" : "G9360", "value" : 2 }
{ "_id" : "K1511", "value" : 2 }
{ "_id" : "K1592", "value" : 2 }
{ "_id" : "K2616", "value" : 2 }
{ "_id" : "K4035", "value" : 2 }
{ "_id" : "K4053", "value" : 2 }
{ "_id" : "K4127", "value" : 2 }
{ "_id" : "K4162", "value" : 2 }
Type "it" for more
> it
{ "_id" : "K4382", "value" : 2 }
{ "_id" : "K4668", "value" : 2 }
{ "_id" : "K7001", "value" : 2 }
{ "_id" : "K7150", "value" : 2 }
{ "_id" : "K864", "value" : 2 }
{ "_id" : "Y667", "value" : 2 }
{ "_id" : "Y670", "value" : 2 }
{ "_id" : "Z322", "value" : 2 }
{ "_id" : "Z4277", "value" : 2 }
另一个简单方法
db.train_code.aggregate([{$group:{_id:"$code",count:{$sum:1}}}, {$match:{count:{$gt:1}}}]);
现在有一个表记录重复的code了,定义一个变量来记录这些code
> var allcode = [];
> db.temp_map_reduce.find({value:{$gt:1}}).forEach(function(d){allcode.push(d._id)});
确认一遍这些code确实是重复数据
db.train_code.find({code:{$in:allcode}}).pretty();
看这一条,果然是重复数据
{ "_id" : ObjectId("587d78cfb217c97c028b45bc"), "code" : "3337", "distance" : "1774公里", "run_time" : "43小时38分", "st
art_station" : "杭州", "end_station" : "成都", "start_time" : "08:37", "end_time" : "04:15", "train_type" : "普快", "act
ive" : 0 }
{ "_id" : ObjectId("587d78cfb217c97c028b45e0"), "code" : "3337", "distance" : "1774公里", "run_time" : "43小时38分", "st
art_station" : "杭州", "end_station" : "成都", "start_time" : "08:37", "end_time" : "04:15", "train_type" : "普快", "act
ive" : 0 }
接下来删除重复值
> var uniqueCode = [],disId = [];
> db.train_code.find({code:{$in:allcode}}).forEach(function(d){
... if(disCode.indexOf(d.code)!==-1){
... disId.push(d._id)
... }else{
... disCode.push(d.code)
... }
... });
> disCode
[
"D2270",
"D8246",
"D4698",
"D3709",
"D7755",
"D5654",
"G9360",
"G7618",
"G452",
"G6241",
"K4162",
"K2616",
"K4127",
"K4035",
"K1592",
"K1511",
"K7150",
"K4053",
"K4382",
"K4668",
"K7001",
"Z4277",
"Z322",
"3337",
"3736",
"6920",
"Y667",
"Y670",
"K864"
]
> disCode.length
29
> disId.length
29
> disId
[
ObjectId("5878a916b217c9615d8b4602"),
ObjectId("5878a94bb217c9445d8b45dd"),
ObjectId("5878a95fb217c9565d8b4593"),
ObjectId("5878a95fb217c9565d8b45d6"),
ObjectId("5878a98eb217c9685d8b459c"),
ObjectId("587c3ef3b217c9ed328b45ce"),
ObjectId("587c42cab217c997348b45a8"),
ObjectId("587c4efcb217c97f378b45ae"),
ObjectId("587c4f62b217c98a378b45c3"),
ObjectId("587c6f4cb217c92c408b459c"),
ObjectId("587c8914b217c948468b458f"),
ObjectId("587c8914b217c948468b4596"),
ObjectId("587c8ca3b217c919478b4575"),
ObjectId("587c9d17b217c98c4d8b45bd"),
ObjectId("587c9d1fb217c9994d8b4620"),
ObjectId("587c9d1fb217c9994d8b4621"),
ObjectId("587c9d1fb217c9994d8b4629"),
ObjectId("587c9d4eb217c9964d8b458c"),
ObjectId("587c9d4eb217c9964d8b45bb"),
ObjectId("587c9d4eb217c9964d8b45eb"),
ObjectId("587c9f63b217c9044e8b4621"),
ObjectId("587d749fb217c964018b45e5"),
ObjectId("587d749fb217c964018b461d"),
ObjectId("587d78cfb217c97c028b45e0"),
ObjectId("587d78cfb217c97c028b461a"),
ObjectId("587d79e0b217c97f028b4599"),
ObjectId("587d79e0b217c97f028b45b7"),
ObjectId("587d79e0b217c97f028b45b8"),
ObjectId("587d9251b217c94a088b45a4")
]
> db.train_code.remove({_id:{$in:disId}});
WriteResult({ "nRemoved" : 29 })
成功删除29条数据