Converting a CSV file to a specified JSON format - Python

I am new to Python and don't know how to achieve this. I am trying to convert a CSV file to JSON format. Addresses have two types, 1. Primary and 2. Work, and address is a multi-valued attribute as well: a person can have two Primary addresses.
Input Data in CSV format
"f_name"|"l_name"|"address_type"|"address_line_1"|"city"|"state"|"postal_code"|"country"
Brad|Pitt|Primary|"18 Atherton"|Irvine|CA|"92620-2501"|USA
Brad|Pitt|work|"1325 S Grand Ave"|Santa Ana|CA|"92705-4406"|USA
Expected output in JSON format
{
  "f_name": "Brad",
  "l_name": "Pitt",
  "parsed_address": [
    {
      "address_type": "Primary",
      "address": [
        {
          "address_line_1": "18 Atherton",
          "city": "Irvine",
          "state": "CA",
          "postal_code": "92620-2501",
          "country": "USA"
        }
      ]
    },
    {
      "address_type": "work",
      "address": [
        {
          "address_line_1": "1325 S Grand Ave",
          "city": "Santa Ana",
          "state": "CA",
          "postal_code": "92705-4406",
          "country": "USA"
        }
      ]
    }
  ]
}
Code I tried
df = pd.read_csv("file")
g_cols = ['f_name', 'l_name']
address_field = ['address']
cols = ['address_line_1', 'address_line_2', 'address_line_3', 'city', 'state', 'postal_code', 'country']
for i in g_cols:
    if i in dict_val.keys():
        g_cols[g_cols.index(i)] = dict_val[i]
for i in cols:
    if i in dict_val.keys():
        cols[cols.index(i)] = dict_val[i]
df2 = df.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(
    name=address_field).to_dict('record')

You were close. This should do exactly what you aim to do.
import pandas as pd

df = pd.read_csv("data.csv", sep="|")

# Build one record per person, grouping that person's rows by address_type.
records = []
for (f_name, l_name), person in df.groupby(["f_name", "l_name"]):
    dic = {"f_name": f_name, "l_name": l_name, "parsed_address": []}
    for address_type, group in person.groupby("address_type"):
        address_dic = {
            "address_type": address_type,
            "address": group.drop(columns=["f_name", "l_name", "address_type"]).to_dict(orient="records"),
        }
        dic["parsed_address"].append(address_dic)
    records.append(dic)
records
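To write the result out, json.dump finishes the job; a minimal sketch (the data.json file name here is just an assumption):
import json

with open("data.json", "w") as f:
    json.dump(records, f, indent=2)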

I think you can try keeping a dictionary (json_data in the code below) to track each person's data, iterating through each row of the dataframe with for _, row in df.iterrows():
import pandas as pd
import json

df = pd.read_csv("file", delimiter='|')
print(df)

json_data = {}
for _, row in df.iterrows():
    key = (row["f_name"], row["l_name"])
    if key not in json_data:
        json_data[key] = {
            "f_name": row["f_name"],
            "l_name": row["l_name"],
            "parsed_address": []
        }
    # Reuse the entry for this address_type if one already exists.
    address_list = None
    for address in json_data[key]["parsed_address"]:
        if address["address_type"] == row["address_type"]:
            address_list = address
    if address_list is None:
        address_list = {
            "address_type": row["address_type"],
            "address": []
        }
        json_data[key]["parsed_address"].append(address_list)
    address_list["address"].append({
        "address_line_1": row["address_line_1"],
        "city": row["city"],
        "state": row["state"],
        "postal_code": row["postal_code"],
        "country": row["country"]
    })

lst = list(json_data.values())
# Verify data parsing
print(json.dumps(lst, indent=2))

records = []
g_cols = ['id', 'first_name', 'last_name']
for name, group in df.groupby(g_cols):
    dic = {"id": name[0], "parsed_address": []}
    for address_type, sub_group in group.groupby("address_type"):
        address_dic = {
            "address_type": address_type,
            "address": sub_group.drop(
                columns=["id", "first_name", "last_name", "address_type"]).to_dict("records")
        }
        dic["parsed_address"].append(address_dic)
    records.append(dic)

Related

How to add duplicate columns together after converting from excel to json in python?

I have an Excel file in the format:
Name    Question    Answer
N1      Q1          a1
N2      Q2          a2
N3      Q3          a3
N4      Q4          a4
N3      Q5          a3
Here some names are the same, and their corresponding answers are also the same. I want to convert this into JSON in a format where all the rows with the same name are merged.
[
  {
    "name": "N1",
    "exampleSentences": ["Q1"],
    "defaultReply": {
      "text": ["a1"],
      "type": "text"
    }
  },
  {
    "name": "N2",
    "exampleSentences": ["Q2"],
    "defaultReply": {
      "text": ["a2"],
      "type": "text"
    }
  },
  {
    "name": "N3",
    "exampleSentences": ["Q3", "Q5"],
    "defaultReply": {
      "text": ["a3"],
      "type": "text"
    }
  },
  {
    "name": "N4",
    "exampleSentences": ["Q4"],
    "defaultReply": {
      "text": ["a4"],
      "type": "text"
    }
  }
]
Here is the code that I wrote:
# Import the required python modules
import pandas as pd
import math
import json
import csv

# Define the name of the Excel file
fileName = "FAQ_eng"

# Read the Excel file
df = pd.read_excel("{}.xlsx".format(fileName))

intents = []
intentNames = df["Name"]

# Loop through the list of Names and create a new intent for each row
for index, name in enumerate(intentNames):
    if name is not None:
        exampleSentences = []
        defaultReplies = []
        if df["Question"][index] is not None and df["Question"][index] is not float:
            try:
                exampleSentences = df["Question"][index]
                exampleSentences = [exampleSentences]
                defaultReplies = df["Answer"][index]
                defaultReplies = [defaultReplies]
            except:
                continue
        intents.append({
            "name": name,
            "exampleSentences": exampleSentences,
            "defaultReply": {
                "text": defaultReplies,
                "type": "text"
            }
        })

# Write the list of created intents into a JSON file
with open("{}.json".format(fileName), "w", encoding="utf-8") as outputFile:
    json.dump(intents, outputFile, ensure_ascii=False)
My code adds another JSON object:
{
  "name": "N3",
  "exampleSentences": ["Q5"],
  "defaultReply": {
    "text": ["a3"],
    "type": "text"
  }
}
instead of merging Q3 and Q5. What should I do?
The problem is that you are iterating through the rows one by one, and at every iteration you would need to check the previously created items to see whether the current name is already present. You can avoid this if you use an initially empty dictionary d that stores key-value pairs of the form d[name] = {"exampleSentences": [question], "text": [answer]}. You can then iterate over df["Name"] like below:
intentNames = df["Name"]
d = {}

# Loop through intentNames and create the dictionary
for index, name in enumerate(intentNames):
    question = df["Question"][index]
    answer = df["Answer"][index]
    if name not in d:
        d[name] = {"exampleSentences": [question], "text": [answer]}
    else:
        d[name]["exampleSentences"].append(question)
Then you can use the created dictionary to create the json file with the expected output like below:
intentNames = df["Name"]
d = {}

# Loop through intentNames and create the dictionary
for index, name in enumerate(intentNames):
    question = df["Question"][index]
    answer = df["Answer"][index]
    if name not in d:
        d[name] = {"exampleSentences": [question], "text": [answer]}
    else:
        d[name]["exampleSentences"].append(question)

# Create the json array
intents = []
for k, v in d.items():
    intents.append({
        "name": k,
        "exampleSentences": v['exampleSentences'],
        "defaultReply": {
            "text": v['text'],
            "type": "text"
        }
    })

# Write the list of created intents into a JSON file
with open("{}.json".format(fileName), "w", encoding="utf-8") as outputFile:
    json.dump(intents, outputFile, ensure_ascii=False)
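As an aside, the same merge can be expressed with a pandas groupby; this is only a sketch under the same column names, and it assumes the answers really are identical within a name (so the first one per group is kept):
import json
import pandas as pd

df = pd.read_excel("FAQ_eng.xlsx")

# One row per name: questions collected into a list, first answer kept.
merged = df.groupby("Name", sort=False).agg(
    exampleSentences=("Question", list),
    text=("Answer", "first"),
).reset_index()

intents = [
    {
        "name": row["Name"],
        "exampleSentences": row["exampleSentences"],
        "defaultReply": {"text": [row["text"]], "type": "text"},
    }
    for _, row in merged.iterrows()
]
print(json.dumps(intents, ensure_ascii=False, indent=2))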

How can I iterate through a dictionary and use context managers in Python?

The dictionary I am trying to iterate through has the following structure:
d = {
    "main_key_1": {
        "name": "Name1",
        "context": "Context1",
        "message": "Message1",
        "date": "Date1",
        "reference": "Reference1"
    },
    "main_key_2": {
        "name": "Name2",
        "context": "Context2",
        "message": "Message2",
        "date": "Date2",
        "reference": "Reference2"
    }
}
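For reference, d.items() yields (key, value) tuples, so item[1] in the code below is one of the nested dicts; a minimal illustration:
for key, info in d.items():
    # key is e.g. "main_key_1"; info is the nested dict
    print(key, info["context"], info["message"])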
This is the way I tried to iterate:
for item in d.items():
    from_context = f"from {item[1]['context']}"
    with context('given a descriptor'):
        with context(from_context):
            with before.all:
                self.descriptor = item[1]['message']
            with context('that contains a date'):
                with it('recognizes the date'):
                    adapter = MessageToDatetAdapter(self.descriptor)
                    result = adapter.is_a_date()
                    expect(result).to(equal(True))
                with it('extracts the date data'):
                    adapter = MessageToDatetAdapter(self.descriptor)
                    result = adapter.adapt()
                    expect(result['date']).to(equal(item[1]['date']))
                    expect(result['reference']).to(equal(item[1]['reference']))
The first iteration would be something like below:
with context('given a descriptor'):
    with context('from Context1'):
        with before.all:
            self.descriptor = 'Message1'
        with context('that contains a date'):
            with it('recognizes the date'):
                adapter = MessageToDatetAdapter(self.descriptor)
                result = adapter.is_a_date()
                expect(result).to(equal(True))
            with it('extracts the date data'):
                adapter = MessageToDatetAdapter(self.descriptor)
                result = adapter.adapt()
                expect(result['date']).to(equal('Date1'))
                expect(result['reference']).to(equal('Reference1'))
However, it seems like this is not correct. It looks like I cannot iterate through all the dictionary items.

Create multiple JSON files from CSV by grouping categories

Here is a CSV file:
year,product,price
2021,P01,50
2022,P03,60
2021,P02,30
I'm trying to create a JSON file for every year with the list of products, like this:
{
  "year": "2021",
  "products": {
    "P02": 30,
    "P01": 50
  },
  "processed": "true"
}
Here is my current code:
import json

csv = """2021,P01,50
2022,P03,60
2021,P02,30
"""

response = {}
for line in csv.splitlines():
    fields = line.split(",")
    year, product, price = fields[0], fields[1], fields[2:]
    if year not in response:
        response[year] = {}
    response[year][product] = price
print(json.dumps(response))
This is the result I get:
{
  "2021": {
    "P02": ["30"],
    "P01": ["50"]
  },
  "2022": {
    "P03": ["60"]
  }
}
Could you please help me get the result I'm looking for?
I'm starting to think I should maybe use a list to do it...
If the same product in the same year does not have different values, then you can create a structure like this:
{
  "2021": {
    "P0": 50,
    "P1": 30
  },
  "2022": {
    "P0": 60
  }
}
To create a structure like that:
import json

csv = """2021,P01,50
2022,P03,60
2021,P02,30
"""

response = {}
for line in csv.splitlines():
    fields = line.split(",")
    year, product, price = fields[0], fields[1], int(fields[2])
    year_response = response.get(year, {})
    year_response[product] = price
    response[year] = year_response

# iterate the dictionary and create your custom response
for year, year_response in response.items():
    file_data = {}
    file_data["year"] = year
    file_data["products"] = year_response
    file_data["processed"] = "true"
    # TODO: add file_data to file now
If the same product in the same year has different values, then you can simply use a list instead of an integer value for "P0". A sketch of filling in the TODO follows below.
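To fill in the TODO, a minimal sketch of writing each file_data to its own file (the per-year file name is just an assumption):
# Inside the loop above, once file_data is built:
with open("{}.json".format(year), "w") as f:
    json.dump(file_data, f, indent=2)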

Appending data from dataframe to JSON

I am attempting to generate JSON output from a dataframe, and I'm not sure why the values are not being assigned separately:
emails = []
if len(df) > 0:
    for index, name in df.iterrows():
        try:
            value = str(df.names)
        except:
            continue
        email = {
            "names": value
        }
        emails.append(email)
print(json.dumps(emails, indent=4))
Output:
[
    {
        "names": "0 abby\n1 josh\n2 john\n3 heather\n4 justin\nName: value, dtype: object"
    },
    {
Desired output:
[
    {
        "names": "abby"
    },
    {
        "names": "josh"
    },
    {
        "names": "john"
    }
]
Use pandas.DataFrame.to_json()
names = df[["names"]]
print(names.to_json(orient="records"))
Refer to the documentation.
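A quick end-to-end illustration (the sample frame below is just for demonstration):
import pandas as pd

df = pd.DataFrame({"names": ["abby", "josh", "john"]})
print(df[["names"]].to_json(orient="records"))
# [{"names":"abby"},{"names":"josh"},{"names":"john"}]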
emails = []
if len(email_list) > 0:
    for index, student in email_list.iterrows():
        try:
            students = int(ids, student_ids)
        except:
            continue
        email = {
            "StudentEmails": [student.email],
            "student": students,
        }
        emails.append(email)
print(json.dumps(emails, indent=4))
You need to read the actual data (name.names), not the whole column (df.names).
Try:
emails = []
if len(df) > 0:
    for index, name in df.iterrows():
        try:
            value = name.names
        except:
            continue
        email = {
            "names": value
        }
        emails.append(email)
print(json.dumps(emails, indent=4))
Instead of
value = str(df.names)
Use
value = list(df.names)
Series objects are not JSON serializable by default, but lists are. For example, this code:
df = pd.DataFrame({"a": [1, 2,4,5]})
json.dumps(list(df.a))
will output:
'[1, 2, 4, 5]'
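Equivalently, Series.tolist() performs the same conversion:
json.dumps(df.a.tolist())  # '[1, 2, 4, 5]'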

How to change numerical data to custom text words in CSV file

The query below grabs data and creates a CSV file. The issue I am having is that the field called ‘SPLE’ stores its values in the database as the numbers 0, 1, and 50.
Those raw numbers end up in the CSV, and I would like them to be represented as words when the CSV is created, like so:
0 = True
1 = False
50 = Pending
Could someone show me how this is done, please? I have been struggling with this.
My Code:
from elasticsearch import Elasticsearch
import csv

es = Elasticsearch(["9200"])
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST*"}}
            ]
        }
    }
}, size=10)

header_names = {'DTDT': 'DATE', 'SPLE': 'TAG', ...}

with open('mycsvfile.csv', 'w') as f:
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)
            header_present = True
        w.writerow(my_dict)
The output in the CSV file is:
Date      SPLE  Venue
20171016  1     Central
20171016  0     Central
20171016  50    Central
I'm assuming the mycsvfile.csv file has an SPLE column.
from elasticsearch import Elasticsearch
import pandas as pd

es = Elasticsearch(["9200"])
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST*"}}
            ]
        }
    }
}, size=10)

SPLE = {0: 'true', 1: 'false', 50: 'pending'}

saved_csv = pd.read_csv('mycsvfile.csv', sep='\t')
saved_csv['SPLE'] = saved_csv['SPLE'].map(lambda x: SPLE[int(x)])
saved_csv.to_csv('edited_csv.csv', index=False)
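Note that Series.map also accepts a dict directly, so the lambda can be dropped when the column is already read as integers:
saved_csv['SPLE'] = saved_csv['SPLE'].map(SPLE)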
Declare a dict somewhere for doing the translation:
SPLE_TRANSLATION = {0: 'True', 1: 'False', 50: 'Pending'}
Then, inside your loop:
my_dict['SPLE'] = SPLE_TRANSLATION[my_dict['SPLE']]
w.writerow(my_dict)
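If some documents might carry a code outside the mapping, dict.get with a fallback avoids a KeyError (keeping the raw value here is just one possible choice):
my_dict['SPLE'] = SPLE_TRANSLATION.get(my_dict['SPLE'], my_dict['SPLE'])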
