"Remove" in else clause changes results of loop over json dict - python

I am iterating over a dict created from a JSON file, which works fine, but as soon as I remove some of the entries in the else clause the results change (normally it prints 35 nuts_ids, but with the remove in the else only 32 are printed). So it seems that the remove influences the iteration, but why? The key should be safe? How can I do this properly without losing data?
import json

with open("test.json") as json_file:
    json_data = json.load(json_file)

for g in json_data["features"]:
    poly = g["geometry"]
    cntr_code = g["properties"]["CNTR_CODE"]
    nuts_id = g["properties"]["NUTS_ID"]
    name = g["properties"]["NUTS_NAME"]
    if cntr_code == "AT":
        print(nuts_id)
        # do plotting etc
    else:  # delete it if it is not part of a specific country
        json_data["features"].remove(g)  # line in question

# do something else with the json_data
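The skipped entries come from how list iteration works: the for loop keeps an internal index, and list.remove() shifts every later element one position to the left, so the element right after a removed one is never visited. A minimal sketch of the effect (the list here is just an illustration, not the GeoJSON data):

codes = ["AT", "DE", "DE", "AT", "DE", "DE"]
for c in codes:
    if c != "AT":
        codes.remove(c)  # shifts later elements left; the next element is skipped
print(codes)  # ['AT', 'AT', 'DE', 'DE'] -- two 'DE' entries survive because the
              # loop's internal index stepped past them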

It's not good practice to delete items from a list while iterating over it. Instead, you can filter for just the elements you do need.
Ex:
import json

with open("test.json") as json_file:
    json_data = json.load(json_file)

# Filter out other country codes.
json_data_features = [g for g in json_data["features"] if g["properties"]["CNTR_CODE"] == "AT"]
json_data["features"] = json_data_features

for g in json_data["features"]:
    poly = g["geometry"]
    cntr_code = g["properties"]["CNTR_CODE"]
    nuts_id = g["properties"]["NUTS_ID"]
    name = g["properties"]["NUTS_NAME"]
    # do plotting etc

# do something else with the json_data

Always remember the cardinal rule: never modify an object while you are iterating over it.
Instead, iterate over a copy of the list, made with list.copy(); a shallow copy is enough here, because only top-level elements are removed:
import json

with open("test.json") as json_file:
    json_data = json.load(json_file)

# Take a copy of the features list
json_data_copy = json_data["features"].copy()

# Iterate over the copy
for g in json_data_copy:
    poly = g["geometry"]
    cntr_code = g["properties"]["CNTR_CODE"]
    nuts_id = g["properties"]["NUTS_ID"]
    name = g["properties"]["NUTS_NAME"]
    if cntr_code == "AT":
        print(nuts_id)
        # do plotting etc
    else:  # delete it if it is not part of a specific country
        json_data["features"].remove(g)  # safe now: we iterate over the copy

Related

Handle index position in Python script to delete json objects from json file - Resolved

I have a file (my_file.json) with the contents below:
[
    {
        "use": "abcd",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    },
    {
        "use": "abcd",
        "contact": "xyz",
        "name": "some_other_script.py",
        "time": "11:22:33"
    },
    {
        "use": "apqwkndf",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    },
    {
        "use": "kjdshfjkasd",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    }
]
I used the following Python code to delete the objects that have "name": "my_script.py":
#!/usr/bin/python
import json

obj = json.load(open("my_file.json"))

index_list = []
for i in xrange(len(obj)):
    if obj[i]["name"] == "my_script.py":
        index_list.append(i)

for x in range(len(index_list)):
    obj.pop(index_list[x])

open("output_my_file.json", "w").write(json.dumps(obj, indent=4, separators=(',', ': ')))
but it seems I am stuck, because after popping an index the positions in the actual obj shift, which leads to deleting the wrong index, or the pop index going out of range. Any other solution?
Try popping in reverse order, so each pop only shifts elements at higher indices than the ones you still have to remove:
for x in reversed(range(len(index_list))):
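A minimal sketch of the full fix under that approach (file names taken from the question):

import json

obj = json.load(open("my_file.json"))

index_list = [i for i, item in enumerate(obj) if item["name"] == "my_script.py"]

# index_list is ascending, so walk it backwards: popping a high index
# never shifts the lower indices that are still to be popped
for idx in reversed(index_list):
    obj.pop(idx)

open("output_my_file.json", "w").write(json.dumps(obj, indent=4, separators=(',', ': ')))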
Alternatively, create a new list and assign only the entries without "name": "my_script.py" to it:
obj = [i for i in obj if i["name"] != "my_script.py"]
import json

with open('my_file.json') as f:
    data = json.load(f)

data = [item for item in data if item.get('name') != 'my_script.py']

with open('output_my_file.json', 'w') as f:
    json.dump(data, f, indent=4)
Try:
import json

json_file = json.load(open("file.json"))

for json_dict in json_file:
    json_dict.pop("name", None)

print(json.dumps(json_file, indent=4))
You don't need the last line with json.dumps; I just have it there so the output looks more readable when printed.
As a general rule of thumb, you don't want to change an iterable while iterating over it.
I suggest you save the elements you do want in the first loop:
import json

with open('path/to/file', 'r') as f:
    data = json.load(f)

items_to_keep = []
for item in data:
    if item['name'] != 'my_script.py':
        items_to_keep.append(item)

with open('path/to/file', 'w') as f:
    json.dump(items_to_keep, f, ...)
The filtering can be reduced to a single line (a list comprehension):
import json

with open('path/to/file', 'r') as f:
    data = json.load(f)

items_to_keep = [item for item in data if item['name'] != 'my_script.py']

with open('path/to/file', 'w') as f:
    json.dump(items_to_keep, f, ...)

How to create variables to loop over files and merge into a dataframe?

I want to create a DataFrame with data for tennis matches of a specific player, 'Lenny Hampel'. For this I downloaded a lot of .json files with data for his matches; all in all there are around 100 files. As they are JSON files I need to convert every single file into a dict to get it into the DataFrame, and finally concatenate them all. I could hard-code it, but that seems rather silly, and I could not find a proper way to iterate through the files.
Could you help me understand how I could create a loop, or something else, to code this the smart way?
from bs4 import BeautifulSoup
import requests
import json
import bs4 as bs
import urllib.request
from urllib.request import Request, urlopen
import pandas as pd
import pprint

with open('lenny/2016/lenny2016_match (1).json') as json_file:
    lennymatch1 = json.load(json_file)

player = [item
          for item in lennymatch1["stats"]
          if item["player_fullname"] == "Lenny Hampel"]

with open('lenny/2016/lenny2016_match (2).json') as json_file:
    lennymatch2 = json.load(json_file)

player2 = [item
           for item in lennymatch2["stats"]
           if item["player_fullname"] == "Lenny Hampel"]

with open('lenny/2016/lenny2016_match (3).json') as json_file:
    lennymatch3 = json.load(json_file)

player33 = [item
            for item in lennymatch3["stats"]
            if item["player_fullname"] == "Lenny Hampel"]

with open('lenny/2016/lenny2016_match (4).json') as json_file:
    lennymatch4 = json.load(json_file)

player4 = [item
           for item in lennymatch4["stats"]
           if item["player_fullname"] == "Lenny Hampel"]

tabelle1 = pd.DataFrame.from_dict(player)
tabelle2 = pd.DataFrame.from_dict(player2)
tabelle3 = pd.DataFrame.from_dict(player33)
tabelle4 = pd.DataFrame.from_dict(player4)

tennisstats = [tabelle1, tabelle2, tabelle3, tabelle4]
result = pd.concat(tennisstats)
result
This is quite basic looping: build a list before the loop, append to it inside the loop, and concatenate after the loop.
# --- before loop ---
tennisstats = []

# --- loop ---
for filename in ["lenny/2016/lenny2016_match (1).json", "lenny/2016/lenny2016_match (2).json"]:
    with open(filename) as json_file:
        lennymatch = json.load(json_file)

    player = [item
              for item in lennymatch["stats"]
              if item["player_fullname"] == "Lenny Hampel"]

    tabelle = pd.DataFrame.from_dict(player)
    tennisstats.append(tabelle)

# --- after loop ---
result = pd.concat(tennisstats)
If the filenames are similar and differ only by a number:
for number in range(1, 101):
    filename = f"lenny/2016/lenny2016_match ({number}).json"
    with open(filename) as json_file:
and the rest is the same as in the first version.
If all the files are in the same folder then maybe you should use os.listdir() (remember to import os):
directory = "lenny/2016/"
for name in os.listdir(directory):
    filename = directory + name
    with open(filename) as json_file:
and the rest is the same as in the first version.
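Putting the pieces together, a self-contained sketch of the whole loop (the directory and the "stats" / "player_fullname" keys are taken from the question; the .json filter is an extra safety check, not something the question requires):

import os
import json
import pandas as pd

directory = "lenny/2016/"
tennisstats = []

for name in os.listdir(directory):
    if not name.endswith(".json"):
        continue  # skip anything that is not a match file
    with open(os.path.join(directory, name)) as json_file:
        match = json.load(json_file)
    player = [item for item in match["stats"]
              if item["player_fullname"] == "Lenny Hampel"]
    tennisstats.append(pd.DataFrame(player))

result = pd.concat(tennisstats, ignore_index=True)
print(result)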

Creating objects and adding them to a list by Reading CSV input in python

This is currently my code for reading through a CSV file, creating a Person object for each row, and adding each person to a list. One-line example input: John,Langley,1,2,2,3,5
When I print(per) each time after creating a Person object, the output is correct, but as soon as I add that person to the list, the numeric values (the 'traits') for every person become the same as the last person's traits in the CSV file.
For example:
John,Langley,1,2,2,3,5 --(add to list)--> John,Langley,1,1,1,1,1
Isabel,Smith,3,2,4,4,0 --(add to list)--> Isabel,Smith,1,1,1,1,1
John,Doe,1,1,1,1,1 --(add to list)--> John,Doe,1,1,1,1,1
This is blocking me from continuing, because I need the Person objects' traits to be valid in order to perform analysis on them in the next couple of methods. PLEASE IGNORE MY PRINT STATEMENTS. THEY WERE FOR MY DEBUGGING PURPOSES.
def read_file(filename):
    file = open(filename, "r", encoding='utf-8-sig')
    Traits_dict = {}
    pl = []
    next(file)
    for line in file:
        line = line.rstrip('\n')
        line = line.split(',')
        first = str(line[0].strip())
        last = str(line[1].strip())
        w = line[2].strip()
        hobby = line[3].strip()
        social = line[4].strip()
        eat = line[5].strip()
        sleep = line[6].strip()
        Traits_dict["Work"] = w
        Traits_dict["Hobbies"] = hobby
        Traits_dict["Socialize"] = social
        Traits_dict["Eat"] = eat
        Traits_dict["Sleep"] = sleep
        per = Person(first, last, Traits_dict)
        print(per)
        pl.append(per)
    print(pl[0])
    print(pl[1])
    print(pl[2])
    print(pl[3])
    print(pl[4])
    return pl
All the Person objects share the same Traits_dict, since you initialize the dict before the loop: every Person is handed a reference to the same dict, so the last iteration's values show up everywhere.
Move Traits_dict = {} inside the loop so that each Person gets a fresh dict:
for line in file:
    Traits_dict = {}
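A minimal sketch of the aliasing problem, independent of the CSV code (this Person class is a stand-in for the one in the question):

class Person:
    def __init__(self, first, last, traits):
        self.first = first
        self.last = last
        self.traits = traits  # stores a reference, not a copy

shared = {}
people = []
for first, work in [("John", 1), ("Isabel", 3)]:
    shared["Work"] = work          # mutates the one shared dict
    people.append(Person(first, "X", shared))

print([p.traits["Work"] for p in people])  # [3, 3] -- both see the last value

Creating the dict inside the loop (or passing a copy, e.g. dict(Traits_dict)) gives each Person its own traits.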

list.append() is replacing every variable with the new one

I have a loop in which I edit a JSON object and append it to a list. But outside the loop, the value of all the old elements has changed to the new one.
My question is similar to this one here, but I still can't find a solution to my problem.
This is my Code:
json_data = open(filepath).read()
data = json.loads(json_data)
dataNew = []

# opening file to write json
with open(filepath2, 'w') as outfile:
    for i in range(50):
        random_index_IntentNames = randint(0, len(intent_names)-1)
        random_index_SessionIds = randint(0, len(session_id)-1)
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        data["result"]["metadata"]["intentName"] = intent_names[random_index_IntentNames]
        data["sessionId"] = session_id[random_index_SessionIds]
        data["timestamp"] = timestamp
        dataNew.append(data)
    json.dump(dataNew, outfile, indent=2)
Every item in your list is just a reference to a single object in memory. Similar to what was posted in your linked answer, you need to append copies of the dict.
import copy

my_list = []
a = {1: 2, 3: 4}
b = a             # referencing the same object
c = copy.copy(a)  # creating a different object (a shallow copy)

my_list.append(a)
my_list.append(b)
my_list.append(c)

a[1] = 'hi'  # modify the dict, which changes both a and b, but not c
print(my_list)  # [{1: 'hi', 3: 4}, {1: 'hi', 3: 4}, {1: 2, 3: 4}]
You might be interested in "Is Python call-by-value or call-by-reference? Neither." for further reading.
data is a dict, which means it's mutable and its value is passed by reference; you have to use copy.deepcopy() (https://docs.python.org/2/library/copy.html#copy.deepcopy) if you want to keep the original data unmutated:
from copy import deepcopy

json_data = open(filepath).read()
data = json.loads(json_data)
dataNew = []

# opening file to write json
with open(filepath2, 'w') as outfile:
    for i in range(50):
        random_index_IntentNames = randint(0, len(intent_names)-1)
        random_index_SessionIds = randint(0, len(session_id)-1)
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())

        # Create a deep copy, modify it and append it to the new list
        new_data = deepcopy(data)
        new_data["result"]["metadata"]["intentName"] = intent_names[random_index_IntentNames]
        new_data["sessionId"] = session_id[random_index_SessionIds]
        new_data["timestamp"] = timestamp
        dataNew.append(new_data)
    json.dump(dataNew, outfile, indent=2)
NOTE: If data doesn't store mutable items you can use dict.copy() in order to avoid modifying the original value.
Good luck!
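A minimal sketch of the difference, since data here contains a nested dict under "result" (the structure below just mimics that nesting):

import copy

data = {"result": {"metadata": {"intentName": "greet"}}}

shallow = data.copy()       # copies only the top level
deep = copy.deepcopy(data)  # copies every nested level

data["result"]["metadata"]["intentName"] = "bye"

print(shallow["result"]["metadata"]["intentName"])  # 'bye'  -- nested dict is still shared
print(deep["result"]["metadata"]["intentName"])     # 'greet' -- fully independent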
I was able to find the solution myself: I moved the assignment of data inside the loop and it worked, since json.loads creates a fresh object on each iteration:
json_data = open(filepath).read()
dataNew = []

# opening file to write json
with open(filepath2, 'w') as outfile:
    for i in range(50):
        random_index_IntentNames = randint(0, len(intent_names)-1)
        random_index_SessionIds = randint(0, len(session_id)-1)
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())

        data = json.loads(json_data)  # re-parse: each iteration gets a new dict
        data["result"]["metadata"]["intentName"] = intent_names[random_index_IntentNames]
        data["sessionId"] = session_id[random_index_SessionIds]
        data["timestamp"] = timestamp
        dataNew.append(data)
    json.dump(dataNew, outfile, indent=2)

Read CSV file and filter results

I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values in the URL column, so when a URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that is breaking the parsing; it comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field following the URL always comes with a numeric value in parentheses, e.g. (9999), so this could be used to determine where the comma-containing URL ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this:
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
         'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
import re

# read the file once
with open(filein, 'r') as f:
    filedata = f.read()

# make a list of everything you want to change
# (regex is a placeholder: it must match the unquoted URLs)
old = re.findall(regex, filedata)

# append quotes and create a new list
new = ['"' + url + '"' for url in old]

# then use the pairs to update the file, accumulating each replacement
for o, n in zip(old, new):
    filedata = filedata.replace(o, n)

with open(filein, 'w') as f:
    f.write(filedata)
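Building on the hint in the question that the field after the URL is always a numeric value in parentheses, a possible pre-processing sketch (the column layout and the "(9999)" marker format are assumptions from the question; requote_line is a hypothetical helper, not part of the csv module):

import re

def requote_line(line, url_col):
    # Wrap the comma-containing URL field in double quotes.
    # Everything from the URL column up to (but not including) the first
    # "(digits)"-style field is treated as the URL.
    fields = line.rstrip("\n").split(",")
    for i in range(url_col + 1, len(fields)):
        if re.fullmatch(r"\(\d+\)", fields[i].strip()):
            url = ",".join(fields[url_col:i])
            return ",".join(fields[:url_col] + ['"' + url + '"'] + fields[i:])
    return line.rstrip("\n")  # no marker found; leave the line untouched

# toy example: URL is the third column (index 2)
print(requote_line("a,b,host/x=1,2,,3,(9999),cat", 2))
# a,b,"host/x=1,2,,3",(9999),cat

After requoting each line this way, the csv module can parse the file normally, since it honors double-quoted fields.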
