Problems with version control for dictionaries inside a Python class - python

I'm doing something wrong in the code below. I have a method (update_dictionary) that changes one or more values in a dictionary based on what is specified in a list of tuples (new_points).
Before I update the dictionary, I want to save the current version in a list (history) in order to be able to access previous versions. However, my attempt below updates all the dictionaries in history to match the latest version.
I can't figure out what I'm doing wrong here.
test_dict = {'var0': {'var1': {'cond1': 1,
                               'cond2': 2,
                               'cond3': 3}
                      }
             }
class version_control:
    def __init__(self, dictionary):
        self.po = dictionary
        self.history = list()
        self.version = 0

    def update_dictionary(self, var0, var1, new_points):
        po_ = self.po
        self.history.append(po_)
        for i in new_points:
            self.po[var0][var1][i[0]] = i[1]
        self.version += 1

    def get_history(self, ver):
        return self.history[ver]
a = version_control(test_dict)

new_points = [('cond1', 2),
              ('cond2', 0)]
a.update_dictionary('var0', 'var1', new_points)

new_points = [('cond3', -99),
              ('cond2', 1)]
a.update_dictionary('var0', 'var1', new_points)

print(a.get_history(0))
print(a.get_history(1))

Try this:
from copy import deepcopy
...
def update_dictionary(self, var0, var1, new_points):
    po_ = deepcopy(self.po)  # snapshot the current state, not a reference to it
    self.history.append(po_)
    for i in new_points:
        self.po[var0][var1][i[0]] = i[1]
    self.version += 1
...
The problem here is that when you assign po_ = self.po you expect po_ to be a new object with its own memory id, but assignment never copies: po_ is just another reference to the same dictionary (same memory id). That means whenever you update self.po, every entry you appended to history changes along with it.
Solve this with deepcopy from the built-in copy module: it creates a fully independent copy, including all the nested dictionaries (a shallow copy would still share the inner dicts).
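A quick sketch of the difference between the three options (a standalone example, separate from the class above):

from copy import deepcopy

po = {'var0': {'var1': {'cond1': 1}}}

alias = po           # same object: id(alias) == id(po)
shallow = dict(po)   # new outer dict, but the nested dicts are shared
deep = deepcopy(po)  # fully independent copy

po['var0']['var1']['cond1'] = 99
print(alias['var0']['var1']['cond1'])    # 99 - the alias follows every change
print(shallow['var0']['var1']['cond1'])  # 99 - nested dicts are still shared
print(deep['var0']['var1']['cond1'])     # 1  - only the deep copy is unaffected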
You can also use this code to save every version into a JSON file instead of keeping them in memory.
import json

class version_control:
    def __init__(self, dictionary):
        self.po = dictionary
        self.version = 0
        self.ZEROth_version()

    def update_dictionary(self, var0, var1, new_points, version=None):
        self.version += 1
        for i in new_points:
            self.po[var0][var1][i[0]] = i[1]
        # use the given version number if any, else the internal counter
        ver = self.version if version is None else version
        with open("version.json", "r") as jsonFile:
            # load the existing versions from the file
            data = json.load(jsonFile)
        data[str(ver)] = self.po
        with open("version.json", "w") as jsonFile:
            # save the updated data back to the json file
            json.dump(data, jsonFile, indent=4)

    def get_history(self, ver):
        try:
            with open("version.json", "r") as jsonFile:
                # no .get here: a missing version raises KeyError,
                # which is handled in the except block below
                return json.load(jsonFile)[str(ver)]
        # the file may not exist, be empty, or lack the requested version
        except (json.decoder.JSONDecodeError, FileNotFoundError, KeyError):
            print("File or Version not found")

    def ZEROth_version(self):
        with open("version.json", "w") as f:
            data = {0: self.po}
            json.dump(data, f, indent=4)
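For example, a quick usage sketch (assuming the test_dict from the question and a writable working directory):

a = version_control(test_dict)
a.update_dictionary('var0', 'var1', [('cond1', 2), ('cond2', 0)])
print(a.get_history(0))  # the original dictionary
print(a.get_history(1))  # the state after the first update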
I have explained the main points; if you want more explanation, leave a comment and I will reply as soon as possible.

Related

Convert complex nested JSON to csv using python

I have a complex nested JSON file and I need to convert it completely, but my code can only convert part of the information.
How can I modify my code, or do you have a better approach?
import csv
import json
import sys
import codecs

def trans(path):
    jsonData = codecs.open('H://14.json', 'r', 'utf-8')
    # csvfile = open(path+'.csv', 'w')
    # csvfile = open(path+'.csv', 'wb')
    csvfile = open('H://11.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csvfile, delimiter=',')
    flag = True
    for line in jsonData:
        dic = json.loads(line)
        if flag:
            keys = list(dic.keys())
            print(keys)
            writer.writerow(keys)
            flag = False
        writer.writerow(list(dic.values()))
    jsonData.close()
    csvfile.close()

if __name__ == '__main__':
    path = str(sys.argv[0])
    print(path)
    trans(path)
My JSON file:
{"id":"aa","sex":"male","name":[{"Fn":"jeri","Ln":"teri"}],"age":45,"info":[{"address":{"State":"NY","City":"new york"},"start_date":"2001-09","title":{"name":"Doctor","Exp":"head"},"year":"2001","month":"05"}],"other":null,"Hobby":[{"smoking":null,"gamble":null}],"connect":[{"phone":"123456789","email":"info#gmail.com"}],"Education":"MBA","School":{"State":"NY","City":"new york"}}
{"id":"aa","sex":"female","name":[{"Fn":"lo","Ln":"li"}],"age":34,"info":[{"address":{"State":"NY","City":"new york"},"start_date":"2008-11","title":{"name":"Doctor","Exp":"hand"},"year":"2008","month":"02"}],"other":null,"Hobby":[{"smoking":null,"gamble":null}],"connect":[{"phone":"123456789","email":"info#gmail.com"}],"Education":"MBA","School":{"State":"NY","City":"new york"}}
It only converts part of the information; the 'name', 'info', 'Hobby', 'connect' and 'School' fields are not converted. I need to convert all of the information completely.
You could use the function below to process each dic. It flattens the dict by recursive calls until there is no dict or list left in the values. To avoid collisions between two keys of the same name, I concatenate each key with the key of the previous level.
WARNING: it is tailored to your format, so if you have lists with more than one element in the middle, it will only take the first element into account.
def flatten_dict(input_dict, result=None):
    result = result or {}
    for key, value in input_dict.items():
        if isinstance(value, list):
            current_dict = {key + "_" + k: v for k, v in value[0].items()}
            flatten_dict(current_dict, result)
        elif isinstance(value, dict):
            current_dict = {key + "_" + k: v for k, v in value.items()}
            flatten_dict(current_dict, result)
        else:
            result[key] = value
    return result
Then apply this function to each dic, build a DataFrame, and save it as CSV.

import pandas as pd

res = []
for line in jsonData:
    dic = json.loads(line)
    res.append(flatten_dict(dic))

res_df = pd.DataFrame(res)
res_df.to_csv("result.csv")
Result: one flattened row per JSON line, with columns such as id, sex, name_Fn, name_Ln, age, info_address_State, and so on.

Python - "Converting Nested Json to CSV Function" can't find its attribute for Converting Nested Json

I am trying to create a function called "JsonTransformer" in a Jupyter notebook in order to convert JSON data to CSV format. I first saved it on Jupyter Notebook and then tried to import it and run it for testing with a JSON file.
Below is the code I used. The whole thing works without any problem and produces CSV files. However, whenever I try to import the created module in a new Jupyter notebook, it can't find the "json_to_dataframe" attribute.
import pandas as pd
import json
from copy import deepcopy

class JsonCsvConverter:
    def cross_join(self, left, right):
        new_rows = [] if right else left
        for left_row in left:
            for right_row in right:
                temp_row = deepcopy(left_row)
                for key, value in right_row.items():
                    temp_row[key] = value
                new_rows.append(temp_row)
        return new_rows

    def flatten_list(self, data):
        for elem in data:
            if isinstance(elem, list):
                yield from flatten_list(elem)
            else:
                yield elem

    def json_to_dataframe(data_in):
        def flatten_json(data, prev_heading=''):
            if isinstance(data, dict):
                rows = [{}]
                for key, value in data.items():
                    rows = cross_join(rows, flatten_json(value, prev_heading + '.' + key))
            elif isinstance(data, list):
                rows = []
                for i in range(len(data)):
                    [rows.append(elem) for elem in flatten_list(flatten_json(data[i], prev_heading))]
            else:
                rows = [{prev_heading[1:]: data}]
            return rows

        return pd.DataFrame(flatten_json(data_in))
import json
import JsonTransformer

if __name__ == '__main__':
    with open('cnntv_json') as json_file:
        json_data = json.load(json_file)
    df = JsonTransformer.json_to_dataframe(json_data)
    df.to_csv('cnntv.csv', mode='w')
However, I always get the error "module 'JsonTransformer' has no attribute 'json_to_dataframe'" when I try to import the function in a new Jupyter notebook page; it seems that it can't see the attribute.
How can I solve this problem? I would be glad if someone could help me. Thanks.
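One detail worth checking, assuming the code above is saved as JsonTransformer.py: a function defined inside a class body becomes an attribute of that class, not of the module, so it has to be reached through the class. A small sketch:

import JsonTransformer

# hypothetical check, assuming JsonTransformer.py holds the class above
print(hasattr(JsonTransformer, 'json_to_dataframe'))                   # False
print(hasattr(JsonTransformer.JsonCsvConverter, 'json_to_dataframe'))  # True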

Safer or more pythonic way to save python kwargs back to yaml?

This is a heavily abstracted example where I build objects from variables stored in a .yaml file. I'm writing the reverse method to save them back as a new .yaml.
I may create further objects via script, so the output YAML will in general be different from the input.
I'm using locals() to build a dictionary from the kwargs, and then .pop() to strip the entries I don't want to save.
This seems to work and does what I want, but I feel it is ugly. Am I missing a better, safer, or more pythonic way to do this?
I understand there are pickle and dill, but for the current question I'd like to restrict this to reading and writing YAML files. (because)
Note: if attributes are added later, I don't want them saved. This is why I create ob.L right after instantiation.
Input .yaml:

bob:
  args: {'x': 1, 'y': 2}
sue:
  args: {'x': 3, 'y': 4}

Output .yaml:

bob:
  args:
    x: 1
    y: 2
new:
  args:
    x: 5
    y: 6
sue:
  args:
    x: 3
    y: 4
Current script:

import yaml

class A(object):
    wow = 77
    def __init__(self, name, x, y):
        self.name = name
        self.x = x
        self.y = y
        self.L = locals()
        self.L.pop('name')
        self.L.pop('self')

with open('thing.yaml', 'r') as infile:
    d = yaml.load(infile)

obs = []
for name, info in d.items():
    ob = A(name, **info['args'])
    obs.append(ob)

newob = A('new', 5, 6)
obs.append(newob)
newob.ignore_me = 777  # this should not be saved

# rebuild the yaml
d = dict()
for ob in obs:
    info = dict()
    info['args'] = ob.L
    d[ob.name] = info

with open('newthing.yaml', 'w') as outfile:
    yaml.dump(d, outfile, default_flow_style=False, allow_unicode=True)
I can't understand why you're doing any of this. All you need to do is load the YAML, add your new items, and then dump it again.
with open('thing.yaml', 'r') as infile:
    d = yaml.load(infile)

d['new'] = {'args': {'x': 5, 'y': 6}}  # match the structure of the existing entries

with open('newthing.yaml', 'w') as outfile:
    yaml.dump(d, outfile, default_flow_style=False, allow_unicode=True)
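A quick round-trip check (a sketch, assuming the file names above):

import yaml

with open('newthing.yaml') as f:
    print(yaml.safe_load(f)['new'])  # {'args': {'x': 5, 'y': 6}}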

Most effective way to parse CSV and take action based on content of row

I have a CSV file that Splunk generates, similar in format to the following:
Category,URL,Hash,ID,"__mv_Hash","_mkv_ID"
binary,somebadsite.com/file.exe,12345abcdef,123,,,
callback,bad.com,,567,,,
What I need to do is iterate through the CSV file, maintaining header order, and take a different action depending on whether the row is a binary or a callback. For this example, if the row is a binary I'll return an arbitrary "clean" or "dirty" rating, and if it's a callback I'll just print out the details.
Below is the code I'm currently planning to use, but I'm new to Python and would like feedback on it, and on whether there is a better way to accomplish this. I'm also not fully clear on the difference between how I handle the binary rows, for k in (k for k in r.fieldnames if (not k.startswith("__mv_") and not k.startswith("_mkv_"))), and how I handle the rest. Both achieve the same result, so what's the benefit of one over the other?
import gzip
import csv
import json

csv_file = 'test_csv.csv.gz'

class GZipCSVReader:
    def __init__(self, filename):
        self.gzfile = gzip.open(filename)
        self.reader = csv.DictReader(self.gzfile)
        self.fieldnames = self.reader.fieldnames

    def next(self):
        return self.reader.next()

    def close(self):
        self.gzfile.close()

    def __iter__(self):
        return self.reader.__iter__()

def get_rating(hash):
    if hash == "12345abcdef":
        rating = "Dirty"
    else:
        rating = "Clean"
    return hash, rating

def print_callback(result):
    print json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))

def process_results_content(r):
    for row in r:
        values = {}
        values_misc = {}
        if row["Category"] == "binary":
            # Iterate through key:value pairs and add to dictionary
            for k in (k for k in r.fieldnames if (not k.startswith("__mv_") and not k.startswith("_mkv_"))):
                v = row[k]
                values[k] = v
            rating = get_rating(row["Hash"])
            if rating[1] == "Dirty":
                print rating
        else:
            for k in r.fieldnames:
                if not k.startswith("__mv_") and not k.startswith("_mkv_"):
                    v = row[k]
                    values_misc[k] = v
            print_callback(values_misc)
    r.close()

if __name__ == '__main__':
    r = GZipCSVReader(csv_file)
    process_results_content(r)
Finally, would a for...else loop be better than something such as if row["Category"] == "binary"? For example, could I do something such as:

def process_results_content(r):
    for row in r:
        values = {}
        values_misc = {}
        for k in (k for k in r.fieldnames if (not row["Category"] == "binary")):
            v = row[k]
            ...
        else:
            v = row[k]
            ...

It seems like that would be the same logic, where the first clause would capture anything not binary and the second would capture everything else, but it does not produce the correct result.
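For reference, for...else does not branch per item: the else block runs once, after the loop completes without hitting a break, which is why the version above cannot work as intended. A minimal illustration:

for k in ['a', 'b', 'c']:
    pass  # the body runs for every item
else:
    # runs exactly once, because the loop was never broken out of
    print("loop finished without break")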
My take using the pandas library.
Code:
import pandas as pd

csv_file = 'test_csv.csv'
df = pd.read_csv(csv_file)
df = df[["Category", "URL", "Hash", "ID"]]  # Remove the other columns.

get_rating = lambda x: "Dirty" if x == "12345abcdef" else "Clean"
df["Rating"] = df["Hash"].apply(get_rating)  # Assign a value to each row based on the Hash value.
print df

j = df.to_json()  # Self-explanatory. :)
print j
Result:

   Category                       URL         Hash   ID Rating
0    binary  somebadsite.com/file.exe  12345abcdef  123  Dirty
1  callback                   bad.com          NaN  567  Clean

{"Category":{"0":"binary","1":"callback"},"URL":{"0":"somebadsite.com\/file.exe","1":"bad.com"},"Hash":{"0":"12345abcdef","1":null},"ID":{"0":123,"1":567},"Rating":{"0":"Dirty","1":"Clean"}}
If this is your intended result, then just substitute your GZipCSVReader into the above, since I did not emulate the opening of the gzip file.
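Note that pandas can also read the gzipped file directly, so the reader class may not be needed at all (a sketch, assuming the same file name as in the question):

import pandas as pd

# read_csv decompresses on the fly; the default compression='infer'
# would also pick this up from the .gz extension
df = pd.read_csv('test_csv.csv.gz', compression='gzip')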

New Class doesn't have a setItem Method

So, this is my code.
def classMaker(csv):
    csv = csv.split("\n")
    firstLine = csv[0]
    csv = csv[1:]
    class newClass():
        def __init__(self, line):
            self.vars = firstLine
            for i in range(len(line)):
                self[firstLine[i]] = line[i]
    return [newClass(line) for line in csv]
The problem is an AttributeError on the line self[firstLine[i]] = line[i]. It says:

AttributeError: newClass instance has no attribute '__setitem__'

I don't know why this error occurs. My goal is to take in a CSV file exported from Excel and auto-generate attribute names from the field names.
Thank you in advance.
You can avoid newClass altogether if you use collections.namedtuple:

from collections import namedtuple

CSVRow = namedtuple("CSVRow", firstLine)
return [CSVRow(*line) for line in csv]

This assumes that the CSV headers are valid Python identifiers (that is, if you have entries like "Some Value", this won't work unless you process firstLine first).
This will let you do things like this:
# Let's assume your CSV has a Name field
# and that it is the first column
csv_data[3].Name == csv_data[3][0]
# True
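Regarding the header caveat above: namedtuple can also sanitize invalid names itself via rename=True, which replaces them with positional names (a small sketch):

from collections import namedtuple

CSVRow = namedtuple("CSVRow", ["Name", "Some Value"], rename=True)
print(CSVRow._fields)  # ('Name', '_1') - the invalid header was renamed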
Also, you should look into the csv module to simplify CSV processing.
If I infer your intent correctly, you want to replace this line:

self[firstLine[i]] = line[i]

with this:

setattr(self, firstLine[i], line[i])

This will create an attribute on your newClass object named after the column in your data.
E.g.:

Name, Date, Weight
Joe, 23-Sep, 99
...

and

data = classMaker('file.csv')

will produce:

data[0].Name == 'Joe'

P.S. I assume that you will add file I/O, parsing of the CSV file, and the other missing elements.
P.P.S. You can avoid the loop counter i altogether:

for attr, val in zip(firstLine, line):
    setattr(self, attr, val)
P.P.P.S. Here is a complete working sample (Python 2, matching the print syntax below):

import csv

def classMaker(filename):
    class newClass(object):
        def __init__(self, line):
            for attr, val in zip(firstLine, line):
                setattr(self, attr, val)

    with open(filename, 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        firstLine = spamreader.next()
        return [newClass(line) for line in spamreader]

x = classMaker("/tmp/x.csv")
print x[0].Name
