mongoimport -u "username" -p "password" --authenticationDatabase "admin" -d my_db -c mycollection --jsonArray --upsert --upsertFields recipe_id C:/Users/mydata.json
I am using above mongoimport query to import json files to mongo collection and it is working as expected. Here I am using recipe_id as unique filter parameter so while importing if a document have same recipe_id it can skip or replace the data and import only new data.
Now I want to implement the same thing in python so that instead of running it manually i can use it through python script. I tried to use upsert with insert_many method as one of the parameter, But it didn't work.
with open(import_file) as f:
file_data = json.load(f)
new_collection.insert_many(file_data)
client.close()
As I am keep getting this json data on daily basis and this will contain duplicate data for sure, So I want to filter it while importing it to mongodb collection so that only the new data keeps on appending on the db and there won't be data redundancy.
Any suggestion will be very helpfull
sample json data:
[
{
"location_id": 11111,
"recipe_id": "AB8974",
"serving_size_number": 1,
"recipe_fraction_description": null,
"description": "1/2 gallon",
"recipe_name": "ALMOND MILK 32 OZ",
"marketing_name": "Almond Milk",
"marketing_description": null,
"ingredient_statement": "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes": {
"allergen_statement_not_available": null,
"contains_shellfish": "NO",
"contains_peanut": "NO",
"contains_tree_nuts": "YES",
"contains_milk": "NO",
"contains_wheat": "NO",
"contains_soy": "NO",
"contains_eggs": "NO",
"contains_fish": "NO",
"contains_added_msg": "UNKNOWN",
"contains_hfcs": "UNKNOWN",
"contains_mustard": "UNKNOWN",
"contains_celery": "UNKNOWN",
"contains_sesame": "UNKNOWN",
"contains_red_yellow_blue_dye": "UNKNOWN",
"gluten_free_per_fda": "UNKNOWN",
"non_gmo_claim": "UNKNOWN",
"contains_gluten": "NO"
},
"dietary_attributes": {
"vegan": "YES",
"vegetarian": "YES",
"kosher": "YES",
"halal": "UNKNOWN"
},
"primary_attributes": {
"protein": 7.543,
"total_fat": 19.022,
"carbohydrate": 69.196,
"calories": 463.227,
"total_sugars": 61.285,
"fiber": 5.81,
"calcium": 3840.228,
"iron": 3.955,
"potassium": 270.768,
"sodium": 1351.208,
"cholesterol": 0.0,
"trans_fat": 0.0,
"saturated_fat": 1.488,
"monounsaturated_fat": 11.743,
"polyunsaturated_fat": 4.832,
"calories_from_fat": 171.195,
"pct_calories_from_fat": 36.957,
"pct_calories_from_saturated_fat": 2.892,
"added_sugars": null,
"vitamin_d_(mcg)": null
},
"secondary_attributes": {
"ash": null,
"water": null,
"magnesium": 120.654,
"phosphorous": 171.215,
"zinc": 1.019,
"copper": 0.183,
"manganese": null,
"selenium": 1.325,
"vitamin_a_(IU)": 5331.357,
"vitamin_a_(RAE)": null,
"beta_carotene": null,
"alpha_carotene": null,
"vitamin_e_(A-tocopherol)": 49.909,
"vitamin_d_(IU)": null,
"vitamin_c": 0.0,
"thiamin_(B1)": 0.0,
"riboflavin_(B2)": 0.449,
"niacin": 0.979,
"pantothenic_acid": 0.061,
"vitamin_b6": 0.0,
"folacin_(folic_acid)": null,
"vitamin_b12": 0.0,
"vitamin_k": null,
"folic_acid": null,
"folate_food": null,
"folate_DFE": null,
"vitamin_a_(RE)": null,
"pct_calories_from_protein": 6.514,
"pct_calories_from_carbohydrates": 59.751,
"biotin": null,
"niacin_(mg_NE)": null,
"vitamin_e_(IU)": null
}
}
]
Here is the sample json data which I will be getting on daily basis. And in this recipe ID will be unique, There will be chances that same recipe is appearing on every file along with some new recipe. So want to append the new recipes only if it exist.
Below is the sample of database structure
> db.my_col1.findOne()
{
"_id" : ObjectId("5f20c8cc1cd23262e7c28e88"),
"location_id" : 11111,
"recipe_id" : "AB8974",
"serving_size_number" : 1,
"recipe_fraction_description" : null,
"description" : "1/2 gallon",
"recipe_name" : "ALMOND MILK 32 OZ",
"marketing_name" : "Almond Milk",
"marketing_description" : null,
"ingredient_statement" : "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes" : {
"allergen_statement_not_available" : null,
"contains_shellfish" : "NO",
"contains_peanut" : "NO",
"contains_tree_nuts" : "YES",
"contains_milk" : "NO",
"contains_wheat" : "NO",
"contains_soy" : "NO",
"contains_eggs" : "NO",
"contains_fish" : "NO",
"contains_added_msg" : "UNKNOWN",
"contains_hfcs" : "UNKNOWN",
"contains_mustard" : "UNKNOWN",
"contains_celery" : "UNKNOWN",
"contains_sesame" : "UNKNOWN",
"contains_red_yellow_blue_dye" : "UNKNOWN",
"gluten_free_per_fda" : "UNKNOWN",
"non_gmo_claim" : "UNKNOWN",
"contains_gluten" : "NO"
},
"dietary_attributes" : {
"vegan" : "YES",
"vegetarian" : "YES",
"kosher" : "YES",
"halal" : "UNKNOWN"
},
"primary_attributes" : {
"protein" : 7.543,
"total_fat" : 19.022,
"carbohydrate" : 69.196,
"calories" : 463.227,
"total_sugars" : 61.285,
"fiber" : 5.81,
"calcium" : 3840.228,
"iron" : 3.955,
"potassium" : 270.768,
"sodium" : 1351.208,
"cholesterol" : 0,
"trans_fat" : 0,
"saturated_fat" : 1.488,
"monounsaturated_fat" : 11.743,
"polyunsaturated_fat" : 4.832,
"calories_from_fat" : 171.195,
"pct_calories_from_fat" : 36.957,
"pct_calories_from_saturated_fat" : 2.892,
"added_sugars" : null,
"vitamin_d_(mcg)" : null
},
"secondary_attributes" : {
"ash" : null,
"water" : null,
"magnesium" : 120.654,
"phosphorous" : 171.215,
"zinc" : 1.019,
"copper" : 0.183,
"manganese" : null,
"selenium" : 1.325,
"vitamin_a_(IU)" : 5331.357,
"vitamin_a_(RAE)" : null,
"beta_carotene" : null,
"alpha_carotene" : null,
"vitamin_e_(A-tocopherol)" : 49.909,
"vitamin_d_(IU)" : null,
"vitamin_c" : 0,
"thiamin_(B1)" : 0,
"riboflavin_(B2)" : 0.449,
"niacin" : 0.979,
"pantothenic_acid" : 0.061,
"vitamin_b6" : 0,
"folacin_(folic_acid)" : null,
"vitamin_b12" : 0,
"vitamin_k" : null,
"folic_acid" : null,
"folate_food" : null,
"folate_DFE" : null,
"vitamin_a_(RE)" : null,
"pct_calories_from_protein" : 6.514,
"pct_calories_from_carbohydrates" : 59.751,
"biotin" : null,
"niacin_(mg_NE)" : null,
"vitamin_e_(IU)" : null
}
}
Either add a unique index on the recipe.recipe_id field, or code around it by checking if a matching document exists or not:
with open(import_file) as f:
file_data = json.load(f)
for data in file_data:
recipe_id = get('recipe_id') # Add error checking if any chance these fields don't exist
if db.mycollection.find_one({'recipe_id': recipe_id}) is None:
db.mycollection.insert_one(data)