mongo collection筛选删除重复数据

从MongoDB集合中删除重复数据,

备份数据127.0.0.1数据库中的test 数据库的train_code 集合,备份到 backfile文件,账号为username,密码为password

mongoexport -h 127.0.0.1:7392 -d test -c train_code -o backfile-u username -p password

还原数据

mongoimport -h 127.0.0.1:7392 -d test -c train_code -u username -p password --file backfile

查找重复数据,这里只查找code列重复的数据

mapReduce函数传入了两个函数map和Reduce,map映射函数中调用emit(key,value),将原有集合按照key分组,转换成[key:[value,value]]的结构,这里传入了(this.code,1),代表每个值都是1,返回的结果是["code":[1,1,1...]],Reduce函数负责将结果集合合并成转换成[key:Reduce函数运算结果]的结构

最后的结果会输出到临时表temp_map_reduce

> db.train_code.mapReduce(function(){if(this.code){emit(this.code,1);}},function(key,values){return Array.sum(values

);},{out:"temp_map_reduce"});

{

"result" : "temp_map_reduce",

"timeMillis" : 1295,

"counts" : {

"input" : 8961,

"emit" : 8961,

"reduce" : 29,

"output" : 8932

},

"ok" : 1

}

成功了,现在查找重复的数据,_id代表id,value代表重复的数量

> db.temp_map_reduce.find({value:{$gt:1}});

{ "_id" : "3337", "value" : 2 }

{ "_id" : "3736", "value" : 2 }

{ "_id" : "6920", "value" : 2 }

{ "_id" : "D2270", "value" : 2 }

{ "_id" : "D3709", "value" : 2 }

{ "_id" : "D4698", "value" : 2 }

{ "_id" : "D5654", "value" : 2 }

{ "_id" : "D7755", "value" : 2 }

{ "_id" : "D8246", "value" : 2 }

{ "_id" : "G452", "value" : 2 }

{ "_id" : "G6241", "value" : 2 }

{ "_id" : "G7618", "value" : 2 }

{ "_id" : "G9360", "value" : 2 }

{ "_id" : "K1511", "value" : 2 }

{ "_id" : "K1592", "value" : 2 }

{ "_id" : "K2616", "value" : 2 }

{ "_id" : "K4035", "value" : 2 }

{ "_id" : "K4053", "value" : 2 }

{ "_id" : "K4127", "value" : 2 }

{ "_id" : "K4162", "value" : 2 }

Type "it" for more

> it

{ "_id" : "K4382", "value" : 2 }

{ "_id" : "K4668", "value" : 2 }

{ "_id" : "K7001", "value" : 2 }

{ "_id" : "K7150", "value" : 2 }

{ "_id" : "K864", "value" : 2 }

{ "_id" : "Y667", "value" : 2 }

{ "_id" : "Y670", "value" : 2 }

{ "_id" : "Z322", "value" : 2 }

{ "_id" : "Z4277", "value" : 2 }

另一个简单方法

db.train_code.aggregate([{$group:{_id:"$code",count:{$sum:1}}}, {$match:{count:{$gt:1}}}]);

现在有一个表记录重复的code了,定义一个变量来记录这些code

> var allcode = [];

> db.temp_map_reduce.find({value:{$gt:1}}).forEach(function(d){allcode.push(d._id)});

确认一遍这些code确实是重复数据

db.train_code.find({code:{$in:allcode}}).pretty();

看这一条,果然是重复数据

{ "_id" : ObjectId("587d78cfb217c97c028b45bc"), "code" : "3337", "distance" : "1774公里", "run_time" : "43小时38分", "st

art_station" : "杭州", "end_station" : "成都", "start_time" : "08:37", "end_time" : "04:15", "train_type" : "普快", "act

ive" : 0 }

{ "_id" : ObjectId("587d78cfb217c97c028b45e0"), "code" : "3337", "distance" : "1774公里", "run_time" : "43小时38分", "st

art_station" : "杭州", "end_station" : "成都", "start_time" : "08:37", "end_time" : "04:15", "train_type" : "普快", "act

ive" : 0 }

接下来删除重复值

> var uniqueCode = [],disId = [];

> db.train_code.find({code:{$in:allcode}}).forEach(function(d){

... if(disCode.indexOf(d.code)!==-1){

... disId.push(d._id)

... }else{

... disCode.push(d.code)

... }

... });

> disCode

[

"D2270",

"D8246",

"D4698",

"D3709",

"D7755",

"D5654",

"G9360",

"G7618",

"G452",

"G6241",

"K4162",

"K2616",

"K4127",

"K4035",

"K1592",

"K1511",

"K7150",

"K4053",

"K4382",

"K4668",

"K7001",

"Z4277",

"Z322",

"3337",

"3736",

"6920",

"Y667",

"Y670",

"K864"

]

> disCode.length

29

> disId.length

29

> disId

[

ObjectId("5878a916b217c9615d8b4602"),

ObjectId("5878a94bb217c9445d8b45dd"),

ObjectId("5878a95fb217c9565d8b4593"),

ObjectId("5878a95fb217c9565d8b45d6"),

ObjectId("5878a98eb217c9685d8b459c"),

ObjectId("587c3ef3b217c9ed328b45ce"),

ObjectId("587c42cab217c997348b45a8"),

ObjectId("587c4efcb217c97f378b45ae"),

ObjectId("587c4f62b217c98a378b45c3"),

ObjectId("587c6f4cb217c92c408b459c"),

ObjectId("587c8914b217c948468b458f"),

ObjectId("587c8914b217c948468b4596"),

ObjectId("587c8ca3b217c919478b4575"),

ObjectId("587c9d17b217c98c4d8b45bd"),

ObjectId("587c9d1fb217c9994d8b4620"),

ObjectId("587c9d1fb217c9994d8b4621"),

ObjectId("587c9d1fb217c9994d8b4629"),

ObjectId("587c9d4eb217c9964d8b458c"),

ObjectId("587c9d4eb217c9964d8b45bb"),

ObjectId("587c9d4eb217c9964d8b45eb"),

ObjectId("587c9f63b217c9044e8b4621"),

ObjectId("587d749fb217c964018b45e5"),

ObjectId("587d749fb217c964018b461d"),

ObjectId("587d78cfb217c97c028b45e0"),

ObjectId("587d78cfb217c97c028b461a"),

ObjectId("587d79e0b217c97f028b4599"),

ObjectId("587d79e0b217c97f028b45b7"),

ObjectId("587d79e0b217c97f028b45b8"),

ObjectId("587d9251b217c94a088b45a4")

]

> db.train_code.remove({_id:{$in:disId}});

WriteResult({ "nRemoved" : 29 })

成功删除29条数据


**原创文章未经同意请勿转载**

相关信息