how can i display json array to python dataframe - python

I have a json file.
[
{
'orderId': 1811,
'deliveryId': '000001811-1634732661563000',
'shippingBook': '[{"qtyOrdered":1,"bookNoList":["B8303-V05","B8304-V05","B8305-V05","B8306-V05","B8307-V05"],"courseCode":"A8399-S26"},{"courseCode":"A1399-S70","qtyOrdered":1,"bookNoList":["B1301-V06","B1302-V06","B1303-V06","B1304-V06","B1305-1-V06","B1305-2-V06","B1306-V06","B1307-V06"]}]',
}
]
but how can i display in dataframe in format
thank you

The value of 'shippingBook' is a string, so it needs json.loads() to convert it into a Python list of dictionaries.
You can then use ordinary for-loops to build a plain list of rows with the expected data, and convert that list to a DataFrame afterwards.
import json
import pandas as pd

data = [
    {
        'orderId': 1811,
        'deliveryId': '000001811-1634732661563000',
        'shippingBook': '[{"qtyOrdered":1,"bookNoList":["B8303-V05","B8304-V05","B8305-V05","B8306-V05","B8307-V05"],"courseCode":"A8399-S26"},{"courseCode":"A1399-S70","qtyOrdered":1,"bookNoList":["B1301-V06","B1302-V06","B1303-V06","B1304-V06","B1305-1-V06","B1305-2-V06","B1306-V06","B1307-V06"]}]',
    }
]

# --- organize data ---
# Each order carries its books as a JSON-encoded *string*, so the string is
# decoded first and every decoded book becomes one output row that repeats
# the order-level fields.
all_rows = []
for order in data:
    order_id = order['orderId']
    delivery_id = order['deliveryId']
    for book in json.loads(order['shippingBook']):
        row = [order_id, delivery_id, book['courseCode'], book['bookNoList']]
        all_rows.append(row)

# --- convert to DataFrame ---
df = pd.DataFrame(all_rows, columns=['orderId', 'deliveryId', 'courseCode', 'bookNoList'])
print(df.to_string())  # `to_string()` to display all data without `...`
Result:
orderId deliveryId courseCode bookNoList
0 1811 000001811-1634732661563000 A8399-S26 [B8303-V05, B8304-V05, B8305-V05, B8306-V05, B8307-V05]
1 1811 000001811-1634732661563000 A1399-S70 [B1301-V06, B1302-V06, B1303-V06, B1304-V06, B1305-1-V06, B1305-2-V06, B1306-V06, B1307-V06]
EDIT:
You can also do the same directly in the DataFrame.
It needs explode() to split each list into separate rows.
import json
import pandas as pd

data = [
    {
        'orderId': 1811,
        'deliveryId': '000001811-1634732661563000',
        'shippingBook': '[{"qtyOrdered":1,"bookNoList":["B8303-V05","B8304-V05","B8305-V05","B8306-V05","B8307-V05"],"courseCode":"A8399-S26"},{"courseCode":"A1399-S70","qtyOrdered":1,"bookNoList":["B1301-V06","B1302-V06","B1303-V06","B1304-V06","B1305-1-V06","B1305-2-V06","B1306-V06","B1307-V06"]}]',
    }
]

# One row per order to begin with (pd.DataFrame.from_records(data) works too).
df = pd.DataFrame(data)

# Decode the JSON text so every cell holds a list of dictionaries.
df['shippingBook'] = df['shippingBook'].apply(json.loads)

# Give each dictionary its own row, then renumber the rows from 0;
# `drop=True` throws the old (repeated) index away instead of keeping it
# as an extra column.
df = df.explode('shippingBook').reset_index(drop=True)

# Pull the wanted keys out of each dictionary.  `.str.get()` is the method
# form of `.str[...]`; despite the accessor's name it also reads from
# dictionaries.  `.apply(lambda item: item['courseCode'])` is an alternative.
df['courseCode'] = df['shippingBook'].str.get('courseCode')
df['bookNoList'] = df['shippingBook'].str.get('bookNoList')

# The raw dictionaries are no longer needed.
df = df.drop(columns='shippingBook')

print(df.to_string())
And the same with apply(pd.Series) to expand each dictionary into columns.
import json
import pandas as pd

data = [
    {
        'orderId': 1811,
        'deliveryId': '000001811-1634732661563000',
        'shippingBook': '[{"qtyOrdered":1,"bookNoList":["B8303-V05","B8304-V05","B8305-V05","B8306-V05","B8307-V05"],"courseCode":"A8399-S26"},{"courseCode":"A1399-S70","qtyOrdered":1,"bookNoList":["B1301-V06","B1302-V06","B1303-V06","B1304-V06","B1305-1-V06","B1305-2-V06","B1306-V06","B1307-V06"]}]',
    }
]

# One row per order to begin with (pd.DataFrame.from_records(data) works too).
df = pd.DataFrame(data)

# Decode the JSON text so every cell holds a list of dictionaries.
df['shippingBook'] = df['shippingBook'].apply(json.loads)

# Give each dictionary its own row and renumber the rows from 0.
df = df.explode('shippingBook').reset_index(drop=True)

# Expand every dictionary into columns of its own (one column per key).
expanded = df['shippingBook'].apply(pd.Series)

# Only two of the expanded columns are wanted ('qtyOrdered' is left out).
# They are joined on the row index, after which the raw dictionaries go.
wanted = expanded[['bookNoList', 'courseCode']]
df = df.join(wanted).drop(columns='shippingBook')

print(df.to_string())

Related

How to multiply values split from str in DataFrame, Python?

For example DataFrame:
import pandas as pd

# Input: one row per article, all of its sizes packed into a single
# space-separated string.
df = pd.DataFrame({
    'art1': ['n1', 'n2'],
    'sizes': ['35 36 37', '36 38'],
})
print(df)

# need that
# Wanted output: one row per (article, size) pair, sizes as integers.
df_result = pd.DataFrame({
    'art1': ['n1', 'n1', 'n1', 'n2', 'n2'],
    'sizes': [35, 36, 37, 36, 38],
})
print(df_result)
BELOW IS A CORRECT BUT NOT EFFICIENT SOLUTION !!!
# Manual expansion: repeat each article once per size it has, then flatten
# the per-row size lists so both lists line up element by element.
lst_art = []
lst_sizes = [x.split() for x in df['sizes']]
for i in range(len(lst_sizes)):
    for j in range(len(lst_sizes[i])):
        lst_art.append(df['art1'][i])
# sum(list_of_lists, []) concatenates the inner lists into one flat list.
lst_sizes = sum(lst_sizes, [])
df = pd.DataFrame({'art1': lst_art, 'sizes': lst_sizes})
print(df)
any pandas efficient way to get df_result from df?
You can first split the string column into a list and then you can explode each item in the list into a new row
# Start from the packed form: every article keeps its sizes in one
# space-separated string.
df = pd.DataFrame({
    'art1': ['n1', 'n2'],
    'sizes': ['35 36 37', '36 38'],
})

# convert str to list
size_lists = df['sizes'].str.split()
df['sizes'] = size_lists

# create one new row per item in list of `sizes`; the other columns are
# repeated and the original row index is kept on every new row
df_result = df.explode('sizes')
or you can do an overly powerful one liner
df.assign(sizes=df['sizes'].str.split()).explode('sizes')

Too many columns resulting in `PerformanceWarning: DataFrame is highly fragmented`

I have a list of filepaths in the first column of a dataframe. My goal is to create a second column that represents file categories, with categories reflecting the words in the filepath.
import pandas as pd
import numpy as np
# Sample input: a single 'filepath' column.
data = {'filepath': ['C:/barracuda/document.doc', 'C:/dog/document.doc', 'C:/cat/document.doc']
}
df = pd.DataFrame(data)
# Boolean flag: the path contains "dog" or "cat" (case-insensitive regex).
df["Animal"] =(df['filepath'].str.contains("dog|cat",case=False,regex=True))
# Boolean flag: the path contains "barracuda" (case-insensitive).
df["Fish"] =(df['filepath'].str.contains("barracuda",case=False))
# NOTE(review): presumably swaps every True for the name of the column it
# sits in (the Series maps each column name to itself) - confirm on the
# pandas version in use.
df = df.loc[:, 'filepath':'Fish'].replace(True, pd.Series(df.columns, df.columns))
# Turn the remaining False flags into NaN.
df = df.loc[:, 'filepath':'Fish'].replace(False,np.nan)
def squeeze_nan(x):
    """Shift the non-missing values of ``x`` to the front, keeping its labels.

    Parameters
    ----------
    x : pandas.Series
        A labelled sequence of values (for example one DataFrame row when
        used with ``df.apply(squeeze_nan, axis=1)``).

    Returns
    -------
    pandas.Series
        A Series with the same index labels as ``x`` in the same order. The
        non-missing values come first, in their original relative order, and
        every remaining position is NaN.
    """
    original_columns = x.index.tolist()
    squeezed = x.dropna()
    # Relabel the surviving values with the first labels of the original
    # index, so they occupy the leading positions.
    squeezed.index = original_columns[:len(squeezed)]
    # Restore the full set of labels; positions without a value become NaN.
    return squeezed.reindex(original_columns, fill_value=np.nan)
df = df.apply(squeeze_nan, axis=1)
print(df)
This code works. The problem arises when I have 200 statements beginning with df['columnName'] =. Because I have so many, I get the error:
PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling frame.insert many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use newframe = frame.copy()
To fix this I have tried:
dfAnimal = df.copy
dfAnimal['Animal'] = dfAnimal['filepath'].str.contains("dog|cat",case=False,regex=True)
dfFish = df.copy
dfFish["Fish"] =dfFish['filepath'].str.contains("barracuda",case=False)
df = pd.concat(dfAnimal,dfFish)
The above gives me errors such as method object is not iterable and method object is not subscriptable. I then tried df = df.loc[df['filepath'].isin(['cat','dog'])] but this only works when 'cat' or 'dog' is the only word in the column. How do I avoid the performance error?
Try creating all your new columns in a dict, and then convert that dict into a dataframe, and then use pd.concat to add the resulting dataframe (containing the new columns) to the original dataframe:
# Compute every new column first, without touching `df` yet.
paths = df['filepath'].str
new_columns = {
    'Animal': paths.contains("dog|cat", case=False, regex=True),
    'Fish': paths.contains("barracuda", case=False),
}

# Attach all of them with a single concat along the column axis, rather than
# inserting them into `df` one at a time.
new_df = pd.DataFrame(new_columns)
df = pd.concat([df, new_df], axis=1)
Added to your original code, it would be something like this:
import pandas as pd
import numpy as np
# Sample input: a single 'filepath' column.
data = {'filepath': ['C:/barracuda/document.doc', 'C:/dog/document.doc', 'C:/cat/document.doc']
}
df = pd.DataFrame(data)
##### These are the new lines #####
# Every new column is computed up front and collected in a dict ...
new_columns = {
'Animal': df['filepath'].str.contains("dog|cat",case=False,regex=True),
'Fish': df['filepath'].str.contains("barracuda",case=False),
}
# ... then attached to `df` with a single concat along the column axis.
new_df = pd.DataFrame(new_columns)
df = pd.concat([df, new_df], axis=1)
##### End of new lines #####
# NOTE(review): presumably swaps every True for the name of the column it
# sits in (the Series maps each column name to itself) - confirm on the
# pandas version in use.
df = df.loc[:, 'filepath':'Fish'].replace(True, pd.Series(df.columns, df.columns))
# Turn the remaining False flags into NaN.
df = df.loc[:, 'filepath':'Fish'].replace(False,np.nan)
def squeeze_nan(x):
    """Shift the non-missing values of ``x`` to the front, keeping its labels.

    Parameters
    ----------
    x : pandas.Series
        A labelled sequence of values (for example one DataFrame row when
        used with ``df.apply(squeeze_nan, axis=1)``).

    Returns
    -------
    pandas.Series
        A Series with the same index labels as ``x`` in the same order. The
        non-missing values come first, in their original relative order, and
        every remaining position is NaN.
    """
    original_columns = x.index.tolist()
    squeezed = x.dropna()
    # Relabel the surviving values with the first labels of the original
    # index, so they occupy the leading positions.
    squeezed.index = original_columns[:len(squeezed)]
    # Restore the full set of labels; positions without a value become NaN.
    return squeezed.reindex(original_columns, fill_value=np.nan)
df = df.apply(squeeze_nan, axis=1)
print(df)

Parse JSON in a Pandas DataFrame

I have some data in a pandas DataFrame, but one of the columns contains multi-line JSON. I am trying to parse that JSON out into a separate DataFrame along with the CustomerId. Here you will see my DataFrame...
df
Out[1]:
Id object
CustomerId object
CallInfo object
Within the CallInfo column, the data looks like this...
[{"CallDate":"2021-06-21","CallLength":362},{"CallDate":"2021-06-24","CallLength":402}]
I want to create a new DataFrame called df_norm which contains the CustomerId, CallDate, and CallLength.
I have tried several ways but couldn't find a working solution. Can anyone help me with this?
Mock up code example...
import pandas as pd
import json

# Sample columns.  Every CallInfo entry is a JSON array serialized as text,
# one array per customer.
Id = [1, 2, 3]
CustomerId = [700001, 700002, 700003]
CallInfo = [
    '[{"CallDate":"2021-06-21","CallLength":362},{"CallDate":"2021-06-24","CallLength":402}]',
    '[{"CallDate":"2021-07-09","CallLength":102}]',
    '[{"CallDate":"2021-07-11","CallLength":226},{"CallDate":"2021-07-11","CallLength":216}]',
]

# Reconstruct sample DataFrame
sample_columns = {"Id": Id, "CustomerId": CustomerId, "CallInfo": CallInfo}
df = pd.DataFrame(sample_columns)
print(df)
This should work. Create a new list of rows and then toss that into the pd.DataFrame constructor:
# Flatten the frame: one output record per call, repeating the customer-level
# fields on each.  The JSON text in 'CallInfo' is decoded row by row.
new_rows = [
    {
        'Id': record['Id'],
        'CustomerId': record['CustomerId'],
        'CallDate': call['CallDate'],
        'CallLength': call['CallLength'],
    }
    for _, record in df.iterrows()
    for call in json.loads(record['CallInfo'])
]

new_df = pd.DataFrame(new_rows)
print(new_df)
EDIT: to account for None values in CallInfo column:
# One output record per call.  A row whose CallInfo is missing (None/NaN),
# blank, or an empty JSON array still produces a single record, with
# CallDate and CallLength left as None, so the customer is not lost.
new_rows = []
for _, row in df.iterrows():
    raw_calls = row['CallInfo']
    # Only non-blank text can be decoded; None and NaN are not strings.
    if isinstance(raw_calls, str) and raw_calls.strip():
        calls = json.loads(raw_calls)
    else:
        calls = []
    if not calls:
        calls = [{'CallDate': None, 'CallLength': None}]
    # Append inside the loop so that every call is kept, not only the last
    # one decoded for the row.
    for item in calls:
        new_rows.append({
            'Id': row['Id'],
            'CustomerId': row['CustomerId'],
            'CallDate': item['CallDate'],
            'CallLength': item['CallLength']})

How to read this JSON file in Python?

I'm trying to read such a JSON file in Python, to save only two of the values of each response part:
{
"responseHeader":{
"status":0,
"time":2,
"params":{
"q":"query",
"rows":"2",
"wt":"json"}},
"response":{"results":2,"start":0,"docs":[
{
"name":["Peter"],
"country":["England"],
"age":["23"]},
{
"name":["Harry"],
"country":["Wales"],
"age":["30"]}]
}}
For example, I want to put the name and the age in a table. I already tried it this way (based on this topic), but it's not working for me.
import json
import pandas as pd
file = open("myfile.json")
data = json.loads(file)
columns = [dct['name', 'age'] for dct in data['response']]
df = pd.DataFrame(data['response'], columns=columns)
print(df)
I also have seen more solutions of reading a JSON file, but that all were solutions of a JSON file with no other header values at the top, like responseHeader in this case. I don't know how to handle that. Anyone who can help me out?
import json

# json.load() takes the open file object (json.loads() takes a string).
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open("myfile.json", encoding="utf-8") as f:
    payload = json.load(f)

# The records sit under "response" -> "docs"; "responseHeader" is skipped.
columns = [(dic["name"], dic["age"]) for dic in payload["response"]["docs"]]
print(columns)
result:
[(['Peter'], ['23']), (['Harry'], ['30'])]
You can pass the list data["response"]["docs"] to pandas directly as it's a recordset.
# data["response"]["docs"] is a list of dictionaries, which the DataFrame
# constructor accepts directly (one row per dictionary, one column per key).
df = pd.DataFrame(data["response"]["docs"])
print(df)
>>> name country age
0 [Peter] [England] [23]
1 [Harry] [Wales] [30]
The data in your DataFrame will be bracketed, though, as you can see. If you want to remove the brackets, you can consider the following:
for column in df.columns:
    # Every cell holds a one-element list; `.str.get(0)` extracts that
    # element.  Assign through df[column] rather than df.loc[:, column]:
    # the `.loc` form writes into the existing column in place (pandas >= 2.0),
    # so the dtype change requested by astype(int) below would not stick.
    df[column] = df[column].str.get(0)
    if column == 'age':
        df[column] = df[column].astype(int)
sample = {
    "responseHeader": {
        "status": 0,
        "time": 2,
        "params": {
            "q": "query",
            "rows": "2",
            "wt": "json"}},
    "response": {"results": 2, "start": 0, "docs": [
        {
            "name": ["Peter"],
            "country": ["England"],
            "age": ["23"]},
        {
            "name": ["Harry"],
            "country": ["Wales"],
            "age": ["30"]}]
    }}

# Every field is a one-element list, so [0] unwraps the value.  Only the
# name and the age of each document are kept.
data = [(x['name'][0], x['age'][0]) for x in sample['response']['docs']]

# Build the frame from `data` (the list created on the line above).
df = pd.DataFrame(data, columns=['name', 'age'])

Pandas-Python : How do you write new lines in Pandas?

I'm trying to save a list of JSON output from API's GET requests into CSV file using Pandas but below codes only generates single entry, it doesn't create new lines.
sample JSON output :
ID : 27980
Title : ELSVIOS 6 Colors Boho Split Long <font><b>Dress</b></font> Fashion Women O-Neck Maxi <font><b>Dress</b></font> Summer Short Sleeve Solid <font><b>Dress</b></font> With Belt Vestidos XS-3XL32815751265US
Price : $10.32US
Sale Price :$10.32
for resultsget in getlistproductsx:
producturls = resultsget['productTitle']
productids = resultsget['productId']
originalprices = resultsget['originalPrice']
saleprices = resultsget['salePrice']
print(producturls + str(productids) + originalprices + saleprices)
raw_data = {'product_title': [producturls],
'product_id': [productids],
'original_price': [originalprices],
'sale_price': [saleprices]}
df = pd.DataFrame(raw_data, columns = ['product_title', 'product_id', 'original_price', 'sale_price'])
df.to_csv('example2.csv')
As kosist said, you're overwriting your CSV File.
Create a second DataFrame to which you will append the data you imported in the loop.
import pandas as pd

cols = ['product_title', 'product_id', 'original_price', 'sale_price']

# One small DataFrame is created per product and kept in a list; they are
# combined once after the loop.  DataFrame.append() was removed in
# pandas 2.0, and a single pd.concat() also avoids re-copying the growing
# frame on every iteration.
frames = []
for resultsget in getlistproductsx:
    producturls = resultsget['productTitle']
    productids = resultsget['productId']
    originalprices = resultsget['originalPrice']
    saleprices = resultsget['salePrice']
    print(producturls + str(productids) + originalprices + saleprices)
    raw_data = {'product_title': [producturls],
                'product_id': [productids],
                'original_price': [originalprices],
                'sale_price': [saleprices]}
    # create second DataFrame holding just this product
    df2 = pd.DataFrame(raw_data, columns=cols)
    frames.append(df2)

# combine everything; ignore_index=True numbers the rows 0..n-1 instead of
# repeating the per-product index 0 on every row.  pd.concat() rejects an
# empty list, hence the fallback to an empty frame with the same columns.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=cols)

# then write the DataFrame to csv (once, after the loop, so that it is not
# overwritten with a single row on every iteration)
df.to_csv('csv.csv')
You probably want to load all your lines into a pandas DataFrame and after that do to_csv like:
import pandas as pd
df = pd.DataFrame(getlistproductsx)
df.to_csv('csv.csv')

Categories

Resources