How to extract columns from a dictionary that does not have keys - Python

I have tried several resources on how to transform a dict into a DataFrame, but the problem is that this is a weird dict.
It is not shaped like key: {}, key: {}, etc.
The data has lots of items, but the goal is to extract only the stuff inside the dict {}; getting the dates as well would be a plus.
data:
id,client,source,status,request,response,queued,created_at,updated_at
54252,sdf,https://asdasdadadad,,"{
"year": "2010",
"casa": "aca",
"status": "p",
"Group": "57981",
}",,1,"2020-05-02 11:06:17","2020-05-02 11:06:17"
54252,msc-lp,https://discover,,"{
"year": "27",
"casa": "Na",
"status": "p",
"Group": "57981",
}"
My attempts:

#attempt 1
with open('data.csv') as fd:
    pairs = (line.split(None) for line in fd)
    res = {int(pair[0]): pair[1] for pair in pairs if len(pair) == 2 and pair[0].isdigit()}

#attempt 2
import json
import pandas as pd

# reading the JSON data using json.load()
file = 'data.json'
with open(file) as train_file:
    dict_train = json.load(train_file)

# converting the json dataset from a dictionary to a dataframe
train = pd.DataFrame.from_dict(dict_train, orient='index')
train.reset_index(level=0, inplace=True)

#attempt 3
df = pd.read_csv("data.csv")
df = df.melt(id_vars=["index", "Date"], var_name="variables", value_name="values")
None of these works because the data is weirdly shaped.
Expected output:
All the items inside the dictionary, with every key becoming one column of the df:
Date                 year  casa  status  Group
2020-05-02 11:06:17  2010  aca   p       57981
2020-05-02 11:06:17  27    Na    p       57981

Format the data into a valid CSV structure:
id,client,source,status,request,response,queued,created_at,updated_at
54252,sdf,https://asdasdadadad,,'{ "ag": "2010", "ca": "aca", "ve": "p", "Group": "57981" }',,1,"2020-05-02 11:06:17","2020-05-02 11:06:17"
54252,msc-lp,https://discover,,'{ "ag": "27", "ca": "Na", "ve": "p", "Group": "57981" }',,1,"2020-05-02 11:06:17","2020-05-02 11:06:17"
This should work for the worst-case scenario as well; check it out.
import json
import pandas as pd

def parse_column(data):
    try:
        return json.loads(data)
    except Exception as e:
        print(e)
        return None

df = pd.read_csv('tmp.csv', converters={"request": parse_column}, quotechar="'")
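If you then want every key as its own column next to the date, as in the expected output, a possible follow-up (a sketch, assuming every request cell parsed successfully and that created_at holds the date you want) is:

parsed = pd.DataFrame(df["request"].tolist())  # one column per dict key
parsed.insert(0, "Date", df["created_at"])     # carry the timestamp along
print(parsed)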

Related

Save a list of objects with heterogeneous keys in a CSV file in Python

I need to store a list of objects, whose number of keys is not fixed, in a CSV file.
Example:
suppose I have the objects
obj1 = {
    "name": "abc",
    "class": "10th"
}
obj2 = {
    "name": "abcd",
    "class": "11th",
    "div": "A"
}
So there will be common elements, but I need to store all the elements.
My output would be like this:
Sure thing. You'll just have to gather up all of the keys first – that's done with set unions – and then feed the data to a csv.DictWriter.
import csv
import sys

data = [
    {"name": "abc", "class": "10th"},
    {"name": "abcd", "class": "11th", "div": "A"},
    {"name": "qweabcd", "class": "11th", "diy": "Q"},
]

all_keys = set()
for obj in data:
    all_keys |= set(obj)

writer = csv.DictWriter(sys.stdout, sorted(all_keys))
writer.writeheader()
for obj in data:
    writer.writerow(obj)
This outputs
class,div,diy,name
10th,,,abc
11th,A,,abcd
11th,,Q,qweabcd
Since sets don't have an intrinsic order, I'm using sorted to sort the keys alphabetically.
If you need a specific order, that can be done with a custom key function to sorted.
key_order = ["name", "class"]

def get_key_sort_value(key):
    if key in key_order:
        return (0, key_order.index(key))
    return (1, key)

writer = csv.DictWriter(sys.stdout, sorted(all_keys, key=get_key_sort_value))
# ...
will sort name and class first, followed by other keys in alphabetical order.
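With the sample data above, the header row then becomes:

name,class,div,diy

instead of the purely alphabetical class,div,diy,name.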
Another solution, with pandas:
import pandas as pd
obj1 = {"name": "abc", "class": "10th"}
obj2 = {"name": "abcd", "class": "11th", "div": "A"}
df = pd.DataFrame([obj1, obj2])
print(df)
df.to_csv("data.csv", index=False)
Prints:
   name class  div
0   abc  10th  NaN
1  abcd  11th    A
and saves data.csv.
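The written file should look like this (the NaN becomes an empty cell):

name,class,div
abc,10th,
abcd,11th,A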

Excel to JSON format with python

I have an Excel sheet in the following format.
I want to convert this Excel sheet into JSON format using Python. Each JSON object pairs a row's ID with the values under each column heading, in the format below.
{
    "Records": [
        {
            "RecordId": "F1",
            "Assets": [
                {"AssetId": "A1", "Support": "S11"},
                {"AssetId": "A2", "Support": "S12"},
                {"AssetId": "A3", "Support": "S13"}
            ]
        },
        {
            "RecordId": "F2",
            "Assets": [
                {"AssetId": "A1", "Support": "S21"},
                {"AssetId": "A2", "Support": "S22"},
                {"AssetId": "A3", "Support": "S23"}
            ]
        }
    ]
}
I have written some code, but it does not work as I expected.
import json
import pandas as pd

df = pd.read_excel(r'test.xlsx', sheet_name='Sheet2')

# initialize data
data = [0 for i in range(len(df))]
datac = [0 for c in range(len(df.columns))]
newset = dict()

for i in range(len(df)):
    # data[i] = r'{"'+str(df.columns.values[0])+'": "' +str(df.loc[i][0])+'", '+str(df.columns.values[1])+'": "' +str(df.loc[i][1])+'", '+str(df.columns.values[2])+'": "' +str(df.loc[i][2])+'"}'
    # data[i] = {str(df.columns.values[1]): str(df.loc[i][0]), str(df.columns.values[1]): str(df.loc[i][1]), str(df.columns.values[2]): str(df.loc[i][2])}
    for c in range(1, len(df.columns)):
        # data[i] = {str('RecordId'): str(df.loc[i][0]), str('Assets'): [{"AssetId": str(df.columns.values[c]), "Support": str(df.loc[i][c])}]}
        datac[c] = {"AssetId": str(df.columns.values[c]), "Support": str(df.loc[i][c])}
    data[i] = {'RecordId': str(df.loc[i][0]), 'Assets': datac[c]}
    print(data[i])

output_lines = [json.dumps(line) + ",\n" for line in data]
output_lines[-1] = output_lines[-1][:-2]  # remove ",\n" from last line
with open(r'Savedwork.json', 'w') as json_file:
    json_file.writelines(output_lines)
What you need is the iterrows() method: it will iterate over the
dataframe's rows as (index, series) pairs. The columns attribute gives you
the list of column names, so you'll be able to iterate over the columns in the
series and access them by name.
import json
import pandas as pd

df = pd.read_excel('test.xlsx')

recs = []
for i, row in df.iterrows():
    rec = {
        'RecordId': row[0],
        'Assets': [{'AssetId': c, 'Support': row[c]} for c in df.columns[1:]]
    }
    recs.append(rec)

out = {'Records': recs}
(Yes, it could all be done in a single list comprehension, but abusing those hinders readability.)
Also, you don't need to json.dumps individual lines and then assemble them with
newlines (don't work at the text level): build a dictionary with the entire
data, and then json.dump that:
print(json.dumps(out, indent=4))
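A json.dump straight to a file works the same way; a sketch, reusing the Savedwork.json name from the question:

with open('Savedwork.json', 'w') as f:
    json.dump(out, f, indent=4)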
You can create the dicts directly in pandas.
First set the first column with F1, F2 as index:
df.set_index(0, inplace=True)
df.index.name = None
Then build the Assets dicts in pandas with the column names as dict keys, export everything to a dict, and save it to JSON:
import json

df = df.apply(lambda x: [{"AssetId": c, "Support": v} for c, v in x.items()], axis=1)
df = df.reset_index().rename(columns={'index': 'RecordId', 0: 'Assets'})
json_data = {"Records": df.to_dict('records')}

with open(r'Savedwork.json', 'w') as fp:
    json.dump(json_data, fp)
Another solution is to take a snapshot of the entire workbook in JSON format and reorganize it out of the box. Using the collect function of XLtoy, it is possible to do that via the command line; this approach allows you more degrees of freedom.
[I'm the main developer of XLtoy.]

To retrieve specific data from multiple similar portions in a .json file

Part of a JSON file's content is shown below:
{"ID": "PK45", "People": "Kate", "Date": "2020-01-05"}, {"ID": "OI85", "People": "John", "Date": "2020-01-18" }, {"ID": "CN658", "People": "Pevo", "Date": "2020-02-01" }
It has multiple portions containing "ID", "People" and "Date".
What I want to do is retrieve John's ID (in this case "OI85").
If the key is unique, I can use:
data_content = json.loads(data)
ID = data_content['ID']
But there are multiple similar portions, so I can only locate "John" first:
with open("C:\\the_file.json") as data_file:
    data = data_file.read()

where = data.find('John')
where_1 = data[where - 20 : where]
ID = where_1[where_1.find('ID') + 3 : where_1.find('ID') + 7]
print(ID)
It looked clumsy.
What will be the smart json way to retrieve the specific data from multiple similar portions in a .json file?
Thank you.
Iterate on the list of dicts until you find the right one:
import json

data = '[{"ID": "PK45", "People": "Kate", "Date": "2020-01-05"}, {"ID": "OI85", "People": "John", "Date": "2020-01-18"}, {"ID": "CN658", "People": "Pevo", "Date": "2020-02-01"}]'
data_content = json.loads(data)

def find_id_by_name(name, data):
    for d in data:
        if d['People'] == name:
            return d["ID"]
    else:
        raise ValueError('Name not found')

print(find_id_by_name('John', data_content))
# OI85
print(find_id_by_name('Jane', data_content))
# ... ValueError: Name not found
If you have to do many such searches, it may be worth creating another dict from your data to associate IDs to names:
ids_by_name = {d['People']:d['ID'] for d in data_content}
print(ids_by_name['John'])
# OI85
You probably should use the json module, which makes the task trivial:
import json

with open('data.json') as f:
    records = json.load(f)

for record in records:
    if record['People'] == 'John':
        print(record['ID'])
        break

Parse JSON output from API file into CSV

I am currently trying to convert JSON output from an API request to a CSV format so I can store the results in our database. Here is my current code for reference:
import pyodbc
import csv
#import urllib2
import json
import collections
import requests
#import pprint
#import functools

print("Connecting via ODBC")
conn = pyodbc.connect('DSN=DSN', autocommit=True)
print("Connected!\n")
cur = conn.cursor()

sql = """SELECT DATA"""
cur.execute(sql)
#df = pandas.read_sql_query(sql, conn)
#df.to_csv('TEST.csv')
#print('CSV sheet is ready to go!')
rows = cur.fetchall()

obs_list = []
for row in rows:
    d = collections.OrderedDict()
    d['addressee'] = row.NAME
    d['street'] = row.ADDRESS
    d['city'] = row.CITY
    d['state'] = row.STATE
    d['zipcode'] = row.ZIP
    obs_list.append(d)

obs_file = 'TEST.json'
with open(obs_file, 'w') as file:
    json.dump(obs_list, file)

print('Run through API')
url = 'https://api.smartystreets.com/street-address?'
headers = {'content-type': 'application/json'}
with open('test1.json', 'r') as run:
    dict_run = run.readlines()
    dict_ready = (''.join(dict_run))

r = requests.post(url, data=dict_ready, headers=headers)
ss_output = r.text

output = 'output.json'
with open(output, 'w') as of:
    json.dump(ss_output, of)

print('I think it works')

f = open('output.json')
data = json.load(f)
data_1 = data['analysis']
data_2 = data['metadata']
data_3 = data['components']

entity_data = open('TEST.csv', 'w')
csvwriter = csv.writer(entity_data)
count = 0
count2 = 0
count3 = 0
for ent in data_1:
    if count == 0:
        header = ent.keys()
        csvwriter.writerow(header)
        count += 1
    csvwriter.writerow(ent.values())
for ent_2 in data_2:
    if count2 == 0:
        header2 = ent_2.keys()
        csvwriter.writerow(header2)
        count2 += 1
    csvwriter.writerow(ent_2.values())
for ent_3 in data_3:
    if count3 == 0:
        header3 = ent_3.keys()
        csvwriter.writerow(header3)
        count3 += 1
    csvwriter.writerow(ent_3.values())
entity_data.close()
Sample output from API:
[
    {
        "input_index": 0,
        "candidate_index": 0,
        "delivery_line_1": "1 Santa Claus Ln",
        "last_line": "North Pole AK 99705-9901",
        "delivery_point_barcode": "997059901010",
        "components": {
            "primary_number": "1",
            "street_name": "Santa Claus",
            "street_suffix": "Ln",
            "city_name": "North Pole",
            "state_abbreviation": "AK",
            "zipcode": "99705",
            "plus4_code": "9901",
            "delivery_point": "01",
            "delivery_point_check_digit": "0"
        },
        "metadata": {
            "record_type": "S",
            "zip_type": "Standard",
            "county_fips": "02090",
            "county_name": "Fairbanks North Star",
            "carrier_route": "C004",
            "congressional_district": "AL",
            "rdi": "Commercial",
            "elot_sequence": "0001",
            "elot_sort": "A",
            "latitude": 64.75233,
            "longitude": -147.35297,
            "precision": "Zip8",
            "time_zone": "Alaska",
            "utc_offset": -9,
            "dst": true
        },
        "analysis": {
            "dpv_match_code": "Y",
            "dpv_footnotes": "AABB",
            "dpv_cmra": "N",
            "dpv_vacant": "N",
            "active": "Y",
            "footnotes": "L#"
        }
    },
    {
        "input_index": 1,
        "candidate_index": 0,
        "delivery_line_1": "Loop land 1",
        "last_line": "North Pole AK 99705-9901",
        "delivery_point_barcode": "997059901010",
        "components": {
            "primary_number": "1",
            "street_name": "Lala land",
            "street_suffix": "Ln",
            "city_name": "North Pole",
            "state_abbreviation": "AK",
            "zipcode": "99705",
            "plus4_code": "9901",
            "delivery_point": "01",
            "delivery_point_check_digit": "0"
        },
        "metadata": {
            "record_type": "S",
            "zip_type": "Standard",
            "county_fips": "02090",
            "county_name": "Fairbanks North Star",
            "carrier_route": "C004",
            "congressional_district": "AL",
            "rdi": "Commercial",
            "elot_sequence": "0001",
            "elot_sort": "A",
            "latitude": 64.75233,
            "longitude": -147.35297,
            "precision": "Zip8",
            "time_zone": "Alaska",
            "utc_offset": -9,
            "dst": true
        },
        "analysis": {
            "dpv_match_code": "Y",
            "dpv_footnotes": "AABB",
            "dpv_cmra": "N",
            "dpv_vacant": "N",
            "active": "Y",
            "footnotes": "L#"
        }
    }
]
After storing the API output, the trouble is parsing the returned output (the sample above) into a CSV format. The code I'm using to do this is the parsing block at the end of the script above, which
returns the following error: TypeError: string indices must be integers. As someone kindly commented and pointed out, it appears I am iterating over keys instead of over the different dictionaries, and this is where I get stuck, because I'm not sure what to do. From my understanding the JSON looks like it is split into three different arrays with a JSON object for each, but that does not appear to be the case according to the structure? I apologize for the length of the code, but I want to give some context to what I am trying to accomplish.
Consider pandas's json_normalize() method to flatten nested items into tabular df structure:
import json
import pandas as pd
from pandas.io.json import json_normalize  # in newer pandas, use pd.json_normalize instead

with open('Output.json') as f:
    data = json.load(f)

df = json_normalize(data)
df.to_csv('Output.csv')
Do note that components, metadata, and analysis become period-separated prefixes of the corresponding column names. If that is not needed, consider renaming the columns.
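For instance, a sketch of one way to strip those prefixes (safe only if the unprefixed names do not collide across sections):

df.columns = [c.split('.')[-1] for c in df.columns]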
You are saving the request's result.text with json. result.text is a string, so upon rereading it through json you get the same one long string instead of a list. Try writing result.text as is:
output = 'output.json'
with open(output, 'w') as of:
    of.write(ss_output)
That's the cause of the TypeError: string indices must be integers that you mention.
The rest of your code has multiple issues.
The data in the json is a list of dicts, so to get, say, data_1 you need a list comprehension like this:
data_1 = [x['analysis'] for x in data]
You write three types of rows into the same csv file: components, metadata and analysis. That's really odd.
You probably have to rewrite the second half of the code: open three csv writers, one per data type, then iterate over the data items and write their fields into the corresponding csv writer, as sketched below.
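A minimal sketch of that restructuring, once output.json contains the raw API response (the output file names are illustrative, and it assumes every item carries the same keys within each section):

import csv
import json

with open('output.json') as f:
    data = json.load(f)  # a list of result dicts

# one writer per nested section, each in its own file
for section in ('analysis', 'metadata', 'components'):
    rows = [item[section] for item in data]
    with open(section + '.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)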

Write json format using pandas Series and DataFrame

I'm working with CSV files. My goal is to write a JSON file from the CSV file's information. Specifically, I want to get a format similar to miserables.json.
Example:
{"source": "Napoleon", "target": "Myriel", "value": 1},
According to the information I have, the format would be:
[
    {"source": "Germany", "target": "Mexico", "value": 1},
    {"source": "Germany", "target": "USA", "value": 2},
    {"source": "Brazil", "target": "Argentina", "value": 3}
]
However, with the code I used, the output looks as follows:
[
    {"source": "Germany", "target": "Mexico", "value": 1},
    {"source": null, "target": "USA", "value": 2}
][
    {"source": "Brazil", "target": "Argentina", "value": 3}
]
The null source should be Germany. This is one of the main problems, because more countries have that issue. Apart from this, the information is correct. I just want to remove the several lists inside the output and replace null with the correct country.
This is the code I used, with pandas and collections:
import json
import pandas
from collections import Counter
from pandas import Series, DataFrame

csvdata = pandas.read_csv('file.csv', low_memory=False, encoding='latin-1')
countries = csvdata['country'].tolist()
newcountries = list(set(countries))

for element in newcountries:
    bills = csvdata['target'][csvdata['country'] == element]
    frquency = Counter(bills)
    sourceTemp = []
    value = []
    country = element
    for k, v in frquency.items():
        sourceTemp.append(k)
        value.append(int(v))
    forceData = {'source': Series(country), 'target': Series(sourceTemp), 'value': Series(value)}
    dfForce = DataFrame(forceData)
    jsondata = dfForce.to_json(orient='records', force_ascii=False, default_handler=callable)
    parsed = json.loads(jsondata)
    newData = json.dumps(parsed, indent=4, ensure_ascii=False, sort_keys=True)
    # since to_json doesn't have an append mode, this is written to a txt file
    savetxt = open('data.txt', 'a')
    savetxt.write(newData)
    savetxt.close()
Any suggestions to solve this problem are appreciated!
Thanks
Consider removing the Series() around the scalar value, country. By wrapping it in a Series and then upsizing the dictionary of series into a dataframe, you force NaN (later converted to null in json) into the source column to match the lengths of the other series. You can see this by printing out the dfForce dataframe:
from pandas import Series
from pandas import DataFrame

country = 'Germany'
sourceTemp = ['Mexico', 'USA', 'Argentina']
value = [1, 2, 3]

forceData = {'source': Series(country),
             'target': Series(sourceTemp),
             'value': Series(value)}
dfForce = DataFrame(forceData)

#     source     target  value
# 0  Germany     Mexico      1
# 1      NaN        USA      2
# 2      NaN  Argentina      3
To resolve this, simply keep country as a scalar in the dictionary of series:
forceData = {'source': country,
             'target': Series(sourceTemp),
             'value': Series(value)}
dfForce = DataFrame(forceData)

#     source     target  value
# 0  Germany     Mexico      1
# 1  Germany        USA      2
# 2  Germany  Argentina      3
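With that fix, dfForce.to_json(orient='records') yields the intended records:

[{"source":"Germany","target":"Mexico","value":1},{"source":"Germany","target":"USA","value":2},{"source":"Germany","target":"Argentina","value":3}]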
By the way, you do not need a dataframe object to output json. Simply use a list of dictionaries. Consider the following, which uses an OrderedDict collection (to maintain the order of keys). This way the growing list dumps into a text file without appending, which would render invalid json, as opposite-facing adjacent square brackets ...][... are not allowed.
from collections import OrderedDict
...
data = []
for element in newcountries:
    bills = csvdata['target'][csvdata['country'] == element]
    frquency = Counter(bills)
    for k, v in frquency.items():
        inner = OrderedDict()
        inner['source'] = element
        inner['target'] = k
        inner['value'] = int(v)
        data.append(inner)

newData = json.dumps(data, indent=4)
with open('data.json', 'w') as savetxt:
    savetxt.write(newData)
