Appending data from dataframe to JSON - python

I am attempting to generate a JSON output from a dataframe and I'm not sure why the values are not being assigned separately:
emails = []
if len(df) > 0:
    for index, name in df.iterrows():
        try:
            value = str(df.names)
        except:
            continue
        email = {
            "names": value
        }
        emails.append(email)
print(json.dumps(emails, indent=4))
Output:

[
    {
        "names": "0 abby\n1 josh\n2 john\n3 heather\n4 justin\nName: value, dtype: object"
    },
    {

Desired output:

[
    {
        "names": "abby"
    },
    {
        "names": "josh"
    },
    {
        "names": "john"
    }
]

Use pandas.DataFrame.to_json():

names = df[["names"]]
print(names.to_json(orient="records"))

Refer to the documentation.
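For reference, a minimal self-contained sketch (column name assumed to be names, as in the question) of what orient="records" produces:

import pandas as pd

df = pd.DataFrame({"names": ["abby", "josh", "john"]})
print(df[["names"]].to_json(orient="records"))
# [{"names":"abby"},{"names":"josh"},{"names":"john"}]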

emails = []
if len(email_list) > 0:
    for index, student in email_list.iterrows():
        try:
            students = int(ids, student_ids)
        except:
            continue
        email = {
            "StudentEmails": [student.email],
            "student": students,
        }
        emails.append(email)
print(json.dumps(emails, indent=4))

You need to read the actual data (name.names), not the whole column (df.names):

emails = []
if len(df) > 0:
    for index, name in df.iterrows():
        try:
            value = name.names
        except:
            continue
        email = {
            "names": value
        }
        emails.append(email)
print(json.dumps(emails, indent=4))
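A note on why this works (not part of the original answer): df.iterrows() yields (index, row) pairs where row is a Series, so name.names reads the single cell in that row's "names" column. Bracket indexing does the same and is safer when a column label collides with a real Series attribute (for example, a column called "name" would clash with Series.name):

for index, row in df.iterrows():
    value = row["names"]  # same cell as row.names, but immune to attribute collisions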

Instead of

value = str(df.names)

use

value = list(df.names)

Series objects are not JSON serializable by default, but lists are. For example, this code:

df = pd.DataFrame({"a": [1, 2, 4, 5]})
json.dumps(df.a.tolist())

will output:

'[1, 2, 4, 5]'

(Note: for an integer column, Series.tolist() is used here rather than list(df.a), because tolist() converts the NumPy scalars to native Python ints that the json module accepts; for the string column in the question, list(df.names) works as-is.)

Related

How to add duplicate columns together after converting from Excel to JSON in Python?

I have an Excel file in the following format:

Name    Question    Answer
N1      Q1          a1
N2      Q2          a2
N3      Q3          a3
N4      Q4          a4
N3      Q5          a3

Here some names are repeated and their corresponding answers are the same. I want to convert this into JSON in a format where all the rows with the same name are merged.
[
    {
        "name": "N1",
        "exampleSentences": ["Q1"],
        "defaultReply": {
            "text": ["a1"],
            "type": "text"
        }
    },
    {
        "name": "N2",
        "exampleSentences": ["Q2"],
        "defaultReply": {
            "text": ["a2"],
            "type": "text"
        }
    },
    {
        "name": "N3",
        "exampleSentences": ["Q3", "Q5"],
        "defaultReply": {
            "text": ["a3"],
            "type": "text"
        }
    },
    {
        "name": "N4",
        "exampleSentences": ["Q4"],
        "defaultReply": {
            "text": ["a4"],
            "type": "text"
        }
    }
]
Here is the code that I wrote:
# Import the required python modules
import pandas as pd
import math
import json
import csv

# Define the name of the Excel file
fileName = "FAQ_eng"

# Read the Excel file
df = pd.read_excel("{}.xlsx".format(fileName))

intents = []
intentNames = df["Name"]

# Loop through the list of Names and create a new intent for each row
for index, name in enumerate(intentNames):
    if name is not None:
        exampleSentences = []
        defaultReplies = []
        if df["Question"][index] is not None and df["Question"][index] is not float:
            try:
                exampleSentences = df["Question"][index]
                exampleSentences = [exampleSentences]
                defaultReplies = df["Answer"][index]
                defaultReplies = [defaultReplies]
            except:
                continue
        intents.append({
            "name": name,
            "exampleSentences": exampleSentences,
            "defaultReply": {
                "text": defaultReplies,
                "type": "text"
            }
        })

# Write the list of created intents into a JSON file
with open("{}.json".format(fileName), "w", encoding="utf-8") as outputFile:
    json.dump(intents, outputFile, ensure_ascii=False)
My code adds another JSON object

{
    "name": "N3",
    "exampleSentences": ["Q5"],
    "defaultReply": {
        "text": ["a3"],
        "type": "text"
    }
}

instead of merging Q3 and Q5. What should I do?
The problem in your code is that you iterate through the rows without checking whether the current name has already been seen in a previous row. You can avoid this by using an initially empty dictionary d that stores key/value pairs of the form d[name] = {"exampleSentences": [question], "text": [answer]}. You can then iterate over df["Name"] like below:
intentNames = df["Name"]
d = {}
# Loop through intentNames and create the dictionary
for index, name in enumerate(intentNames):
    question = df["Question"][index]
    answer = df["Answer"][index]
    if name not in d:
        d[name] = {"exampleSentences": [question], "text": [answer]}
    else:
        d[name]["exampleSentences"].append(question)
Then you can use the created dictionary to write the JSON file with the expected output, like below:
intentNames = df["Name"]
d = {}
# Loop through intentNames and create the dictionary
for index, name in enumerate(intentNames):
    question = df["Question"][index]
    answer = df["Answer"][index]
    if name not in d:
        d[name] = {"exampleSentences": [question], "text": [answer]}
    else:
        d[name]["exampleSentences"].append(question)

# Create the JSON array
intents = []
for k, v in d.items():
    intents.append({
        "name": k,
        "exampleSentences": v['exampleSentences'],
        "defaultReply": {
            "text": v['text'],
            "type": "text"
        }
    })

# Write the list of created intents into a JSON file
with open("{}.json".format(fileName), "w", encoding="utf-8") as outputFile:
    json.dump(intents, outputFile, ensure_ascii=False)
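For comparison (a sketch, not part of the original answer), the same merge can be expressed with a pandas groupby over the Name/Question/Answer columns from the sample table:

import json
import pandas as pd

df = pd.DataFrame({
    "Name": ["N1", "N2", "N3", "N4", "N3"],
    "Question": ["Q1", "Q2", "Q3", "Q4", "Q5"],
    "Answer": ["a1", "a2", "a3", "a4", "a3"],
})

intents = [
    {
        "name": name,
        "exampleSentences": group["Question"].tolist(),
        "defaultReply": {"text": [group["Answer"].iloc[0]], "type": "text"},
    }
    # sort=False keeps the names in order of first appearance
    for name, group in df.groupby("Name", sort=False)
]
print(json.dumps(intents, indent=2, ensure_ascii=False))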

Writing a List of Dictionaries to separate JSONs

I have a dictionary where each value is a list.
I want to write the individual items to separate JSON files.
For example:

data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}

Now I want 3 JSON files, i.e. data1.json, data2.json, data3.json, in the following (approximate) format.
data1.json

{
    Name: name1,
    email: mail1
}

data2.json

{
    Name: name2,
    email: mail2
}

and so on.
My current approach is:

for file_no in range(no_of_files):
    for count, (key, info_list) in enumerate(data_to_write.items()):
        for info in info_list:
            with open(
                os.path.join(self.path_to_output_dir, str(file_no)) + ".json",
                "a",
            ) as resume:
                json.dump({key: info}, resume)

But this is wrong. Any help is appreciated.
You could use pandas to do the work for you. Read the dictionary into a dataframe, then iterate the rows of the dataframe to produce the JSON for each row:

import pandas as pd

data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}
df = pd.DataFrame(data_to_write).rename(columns={'Names': 'Name'})
for i in range(len(df)):
    jstr = df.iloc[i].to_json()
    with open(f"data{i+1}.json", "w") as f:
        f.write(jstr)
Output (each line is in a separate file):
{"Name":"name1","email":"mail1"}
{"Name":"name2","email":"mail2"}
{"Name":"name3","email":"mail3"}
Try:

import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for i, val in enumerate(zip(*data_to_write.values()), 1):
    d = dict(zip(data_to_write, val))
    with open(f"data{i}.json", "w") as f_out:
        json.dump(d, f_out, indent=4)
This writes data(1..3).json with content:

# data1.json
{
    "Names": "name1",
    "email": "mail1"
}

# data2.json
{
    "Names": "name2",
    "email": "mail2"
}

...
import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for ind, val in enumerate(zip(*data_to_write.values())):
    jsn = dict(zip(data_to_write, val))
    print(jsn)
    with open("data{}.json".format(ind), "w") as f:
        f.write(json.dumps(jsn))
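Both of the last two answers rely on the same idiom: zip(*data_to_write.values()) transposes the per-column lists into per-record tuples, and dict(zip(data_to_write, val)) pairs each tuple back up with the keys (iterating a dict yields its keys). A quick demonstration:

data = {"Names": ["name1", "name2"], "email": ["mail1", "mail2"]}
rows = list(zip(*data.values()))
print(rows)                      # [('name1', 'mail1'), ('name2', 'mail2')]
print(dict(zip(data, rows[0])))  # {'Names': 'name1', 'email': 'mail1'}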

I want to change value in json array object based on index number using pyspark

I want to change a value in a JSON array object based on its index number using PySpark, and then use columnName to update the dataframe's column names:
input:
jsonArray = [
    {
        "index": 1,
        "columnName": "Names"
    },
    {
        "index": 2,
        "columnName": "City"
    }
]

output:

jsonArray = [
    {
        "index": 1,
        "columnName": "titles"
    },
    {
        "index": 2,
        "columnName": "countries"
    }
]

function header:

def renameColumn(index, newName, df):
    return df_with_new_column_names
If I understood your requirement correctly, try something like the below:
for i in range(len(jsonArray)):
    if jsonArray[i]['index'] == 1:
        jsonArray[i]['columnName'] = "titles"
    else:
        jsonArray[i]['columnName'] = "countries"
print(jsonArray)
Output -
[{'index': 1, 'columnName': 'titles'}, {'index': 2, 'columnName': 'countries'}]
def jsonColumnName(jsonArray, indx, newName):
    for jsonObj in jsonArray:
        if jsonObj['index'] == indx:
            jsonObj['columnName'] = newName
    return jsonArray
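Hypothetical usage of the function above, reproducing the renames from the question:

jsonArray = jsonColumnName(jsonArray, 1, "titles")
jsonArray = jsonColumnName(jsonArray, 2, "countries")
print(jsonArray)
# [{'index': 1, 'columnName': 'titles'}, {'index': 2, 'columnName': 'countries'}]

Neither answer fills in the renameColumn header from the question. A sketch using PySpark's withColumnRenamed, assuming each jsonArray entry maps an index to the dataframe column's current name:

def renameColumn(index, newName, df):
    # Rename the matching Spark dataframe column and record
    # the new name back into jsonArray.
    for jsonObj in jsonArray:
        if jsonObj["index"] == index:
            df = df.withColumnRenamed(jsonObj["columnName"], newName)
            jsonObj["columnName"] = newName
    return df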

Python: building complex nested lists within a dictionary

I am looking at building lists of lists within a dictionary from an Excel spreadsheet.
My spreadsheet looks like this:
source_item_id    target_item_id    find_string    replace_string
source_id1        target_id1        abcd1          efgh1
source_id1        target_id1        ijkl1          mnop1
source_id1        target_id2        abcd2          efgh2
source_id1        target_id2        ijkl2          mnop2
source_id2        target_id3        qrst           uvwx
source_id2        target_id3        yzab           cdef
source_id2        target_id4        ghij           klmn
source_id2        target_id4        opqr           stuv
My output dictionary should look like this:
{
    "source_id1": [{
        "target_id1": [{
            "find_string": "abcd1",
            "replace_string": "efgh1"
        },
        {
            "find_string": "ijkl1",
            "replace_string": "mnop1"
        }]
    },
    {
        "target_id2": [{
            "find_string": "abcd2",
            "replace_string": "efgh2"
        },
        {
            "find_string": "ijkl2",
            "replace_string": "mnop2"
        }]
    }],
    "source_id2": [{
        "target_id3": [{
            "find_string": "qrst",
            "replace_string": "uvwx"
        },
        {
            "find_string": "yzab",
            "replace_string": "cdef"
        }]
    },
    {
        "target_id4": [{
            "find_string": "ghij",
            "replace_string": "klmn"
        },
        {
            "find_string": "opqr",
            "replace_string": "stuv"
        }]
    }]
}
With the following code I only get the last values in each of the lists:
import xlrd

xls_path = r"C:\data\ItemContent.xlsx"
book = xlrd.open_workbook(xls_path)
sheet_find_replace = book.sheet_by_index(1)
find_replace_dict = dict()
for line in range(1, sheet_find_replace.nrows):
    source_item_id = sheet_find_replace.cell(line, 0).value
    target_item_id = sheet_find_replace.cell(line, 1).value
    find_string = sheet_find_replace.cell(line, 2).value
    replace_string = sheet_find_replace.cell(line, 3).value
    find_replace_list = [{"find_string": find_string, "replace_string": replace_string}]
    find_replace_dict[source_item_id] = [target_item_id]
    find_replace_dict[source_item_id].append(find_replace_list)
print(find_replace_dict)
--> result

{
    "source_id1": ["target_id2", [{
        "find_string": "ijkl2",
        "replace_string": "mnop2"
    }]],
    "source_id2": ["target_id4", [{
        "find_string": "opqr",
        "replace_string": "stuv"
    }]]
}
Your problem is complicated by the fact that you have a list of single-key dictionaries as the value for each source id. Still, you can follow a pattern of parsing each line into its relevant items and then using those to decide where to append, or alternatively where to create new lists:
from typing import List, Tuple

import xlrd

def process_line(line) -> Tuple[str, str, dict]:
    source_item_id = sheet_find_replace.cell(line, 0).value
    target_item_id = sheet_find_replace.cell(line, 1).value
    find_string = sheet_find_replace.cell(line, 2).value
    replace_string = sheet_find_replace.cell(line, 3).value
    return source_item_id, target_item_id, {
        "find_string": find_string,
        "replace_string": replace_string
    }

def find_target(target: str, ls: List[dict]) -> int:
    # Find the index of the target id in the list
    for i in range(len(ls)):
        if ls[i].get(target):
            return i
    return -1  # Or some other marker

xls_path = r"C:\data\ItemContent.xlsx"
book = xlrd.open_workbook(xls_path)
sheet_find_replace = book.sheet_by_index(1)
result_dict = dict()
for line in range(1, sheet_find_replace.nrows):
    source, target, replacer = process_line(line)
    # You can check here that the above three are correct
    source_list = result_dict.get(source, [])  # Leverage the default value of the get function
    target_idx = find_target(target, source_list)
    target_dict = source_list[target_idx] if target_idx >= 0 else {}
    replace_list = target_dict.get(target, [])
    replace_list.append(replacer)
    target_dict[target] = replace_list
    if target_idx >= 0:
        source_list[target_idx] = target_dict
    else:
        source_list.append(target_dict)
    result_dict[source] = source_list
print(result_dict)
I would note that if source_id pointed to a dictionary rather than a list, this could be radically simplified, since we wouldn't need to search through the list for a potentially already-existing list item and then awkwardly replace or append as needed. If you can change this constraint (remember, you can always convert a dictionary to a list downstream), I might consider doing that.
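A minimal sketch of that simplification, mapping source ids to dictionaries with collections.defaultdict (reusing process_line and sheet_find_replace from above):

from collections import defaultdict

# source_id -> target_id -> list of find/replace dicts
nested = defaultdict(lambda: defaultdict(list))
for line in range(1, sheet_find_replace.nrows):
    source, target, replacer = process_line(line)
    nested[source][target].append(replacer)

# Convert back to the question's list-of-single-key-dicts shape if needed
result = {src: [{tgt: pairs} for tgt, pairs in targets.items()]
          for src, targets in nested.items()}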

Converting a CSV file to a specified JSON format

I am new to Python and don't know how to achieve this. I am trying to convert a CSV file to JSON format. An address has a type (1. Primary, 2. Work), and address is a multi-valued attribute as well; a person can have two Primary addresses.
Input Data in CSV format
"f_name"|"l_name"|"address_type"|"address_line_1"|"city"|"state"|"postal_code"|"country"
Brad|Pitt|Primary|"18 Atherton"|Irvine|CA|"92620-2501"|USA
Brad|Pitt|work|"1325 S Grand Ave"|Santa Ana|CA|"92705-4406"|USA
Expected output in JSON format:

{
    "f_name": "Brad",
    "l_name": "Pitt",
    "parsed_address": [
        {
            "address_type": "Primary",
            "address": [
                {
                    "address_line_1": "18 Atherton",
                    "city": "Irvine",
                    "state": "CA",
                    "postal_code": "92620-2501",
                    "country": "USA"
                }
            ]
        },
        {
            "address_type": "work",
            "address": [
                {
                    "address_line_1": "1325 S Grand Ave",
                    "city": "Santa Ana",
                    "state": "CA",
                    "postal_code": "92705-4406",
                    "country": "USA"
                }
            ]
        }
    ]
}
Code tried:

df = pd.read_csv("file")
g_cols = ['f_name', 'l_name']
address_field = ['address']
cols = ['address_line_1', 'address_line_2', 'address_line_3', 'city', 'state', 'postal_code', 'country']
for i in g_cols:
    if i in dict_val.keys():
        g_cols[g_cols.index(i)] = dict_val[i]
for i in cols:
    if i in dict_val.keys():
        cols[cols.index(i)] = dict_val[i]
df2 = df.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(
    name=address_field).to_dict('record')
You were close. This should do exactly what you aim to do (grouping by the f_name/l_name columns from your sample, and grouping addresses within each person rather than over the whole dataframe):

import pandas as pd

df = pd.read_csv("data.csv", sep="|")

dic = {}
for (f_name, l_name), person_group in df.groupby(["f_name", "l_name"]):
    dic["f_name"] = f_name
    dic["l_name"] = l_name
    dic["parsed_address"] = []
    for address_type, addr_group in person_group.groupby("address_type"):
        address_dic = {}
        address_dic["address_type"] = address_type
        address_dic["address"] = addr_group.drop(
            columns=["f_name", "l_name", "address_type"]).to_dict(orient="records")
        dic["parsed_address"].append(address_dic)
print(dic)
I think you can try having a dictionary or list (json_data in the code below) to keep track of each person's data, iterating through each row of the dataframe with for _, row in df.iterrows():
import pandas as pd

df = pd.read_csv("file", delimiter='|')
print(df)

json_data = {}
for _, row in df.iterrows():
    # Key each person by (f_name, l_name), matching the sample CSV columns
    name = (row["f_name"], row["l_name"])
    address_type = row["address_type"]
    address_line_1 = row["address_line_1"]
    city = row["city"]
    state = row["state"]
    postal_code = row["postal_code"]
    country = row["country"]
    if name not in json_data:
        json_data[name] = {
            "f_name": row["f_name"],
            "l_name": row["l_name"],
            "parsed_address": []
        }
    address_list = None
    for address in json_data[name]["parsed_address"]:
        if address["address_type"] == address_type:
            address_list = address
    if address_list is None:
        address_list = {
            "address_type": address_type,
            "address": []
        }
        json_data[name]["parsed_address"].append(address_list)
    address_list["address"].append({
        "address_line_1": address_line_1,
        "city": city,
        "state": state,
        "postal_code": postal_code,
        "country": country
    })

lst = list(json_data.values())

# Verify data parsing
import json
print(json.dumps(lst, indent=2))
dic = {}
g_cols = ['id', 'first_name', 'last_name', 'address_type']
for name, group in df.groupby(g_cols):
    id = name[0]
    dic["id"] = id
    dic["parsed_address"] = []
    for address_type, addr_group in group.groupby("address_type"):
        address_dic = {}
        address_dic["address_type"] = address_type
        address_dic["address"] = addr_group.drop(
            columns=["id", "first_name", "last_name", "address_type"]).to_dict("records")
        dic["parsed_address"].append(address_dic)
