CSV to JSON array conversion - python

I am trying to convert CSV file to JSON.
CSV File:
id,name,email
1,jim,test#gmail.com
1,jim,test2#gmail.com
2,kim,test3#gmail.com
Expected output
{"row" : {"id":1,"name":"jim","email": ["test#gmail.com","test2#gmail.com"]}},
{"row" : {"id":2,"name":"kim","email": "test3#gmail.com"}}

Here a little bit bulky implementation
import csv
import json

# Group the CSV rows by id, collecting all emails for the same id together.
with open('data.csv') as csvfile:
    rows = csv.reader(csvfile)
    columns = next(rows, None)  # header row
    grouped = {}
    for values in rows:
        # Pair each header with its cell to get a dict for this line.
        record = dict(zip(columns, values))
        key = record['id']
        if key in grouped:
            # Guard against inconsistent name fields for the same id.
            assert record['name'] == grouped[key]['name']
            grouped[key]['email'].append(record['email'])
        else:
            record['email'] = [record['email']]
            grouped[key] = record

for entry in grouped.values():
    # Collapse a one-element email list to a bare string; longer lists stay lists.
    try:
        entry['email'], = entry['email']
    except ValueError:
        pass
    print(json.dumps({'row': entry}))

You can use pandas to do this:
import pandas as pd
# index_col=None keeps the default integer index instead of promoting a CSV column.
df = pd.read_csv('test.csv', index_col=None)
print(df)
#Output
id name email
0 1 jim test#gmail.com
1 1 jim test2#gmail.com
2 2 kim test3#gmail.com
# Collapse duplicate (id, name) pairs, gathering their emails into a list per group.
df1 = df.groupby(['id', 'name'])['email'].apply(list).reset_index()
# orient='index' keys the resulting JSON object by row number.
df_json = df1.to_json(orient='index')
print(df_json)
#Output:
{"0":{"id":1,"name":"jim","email":["test#gmail.com","test2#gmail.com"]},"1":{"id":2,"name":"kim","email":["test3#gmail.com"]}}

Related

Skip First Column in CSV File with Pandas

I have a csv file that is generated that has some information in the first line. I'm trying to skip it but it doesn't seem to work. I tried looking at several suggestions and examples.
I tried using skiprows.
I also looked at several other examples.
Pandas drop first columns after csv read
https://datascientyst.com/pandas-read-csv-file-read_csv-skiprows/
Nothing I tried worked the way I wanted it.
When I got it to work it deleted the entire row.
Here is a sample of the code
# Imports the Pandas module. It must be installed to run this script.
import pandas as pd

# Path of the raw print-log export.
source_file = 'Csvfile.csv'

# Read the raw CSV.  BUG FIX: the original read from the undefined name
# `source_copy`, which raises NameError; the intended variable is `source_file`.
dataframe = pd.read_csv(source_file, encoding='latin1')

# Keep only the columns the report needs (the Grayscale flag becomes 'Color').
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages,
                   'Copies': dataframe.Copies, 'Color': dataframe.Grayscale,
                   'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})

# Recode the flag columns into page counts so they can be summed later.
df.loc[df["Duplex"] == "DUPLEX", "Duplex"] = dataframe.Pages
df.loc[df["Duplex"] == "NOT DUPLEX", "Duplex"] = 0
df.loc[df["Color"] == "NOT GRAYSCALE", "Color"] = dataframe.Pages
df.loc[df["Color"] == "GRAYSCALE", "Color"] = 0

# BUG FIX: sort_values returns a new frame; the original discarded the result.
df = df.sort_values(by=['User', 'Pages'])
df.to_csv('PrinterLogData.csv', index=False)  # to_csv returns None; nothing to keep

# Re-open the parsed CSV file.
output_source = "PrinterLogData.csv"
dataframe = pd.read_csv(output_source, encoding='latin1')

# Creates a new DataFrame from the cleaned data.
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages,
                   'Copies': dataframe.Copies, 'Color': dataframe.Color,
                   'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})

# Groups data by user / printer and sums the page counts.
Report1 = df.groupby(['User'], as_index=False).sum().sort_values('Pages', ascending=False)
Report2 = df.groupby(['Printer'], as_index=False).sum().sort_values('Pages', ascending=False)
Sample Data
Sample Output of what I'm looking for.
This is an early draft of what you appear to want for your program (based on the simulated print-log.csv):
import csv
import itertools
import operator
import pathlib

CSV_FILE = pathlib.Path('print-log.csv')
EXTRA_COLUMNS = ['Pages', 'Grayscale', 'Color', 'Not Duplex', 'Duplex']


def main():
    """Read the print log (skipping its extra first line) and emit both reports."""
    with CSV_FILE.open('rt', newline='') as stream:
        lines = iter(stream)
        next(lines)  # drop the informational first line before the real header
        rows = list(csv.DictReader(lines))
    create_report(rows, 'Printer')
    create_report(rows, 'User')


def create_report(table, column_name):
    """Write '<column_name> Report.csv' summarising page counts per distinct value."""
    group_key = operator.itemgetter(column_name)
    table.sort(key=group_key)  # groupby needs the rows pre-sorted on the same key
    out_path = pathlib.Path(f'{column_name} Report').with_suffix('.csv')
    with out_path.open('wt', newline='') as stream:
        writer = csv.DictWriter(stream, [column_name] + EXTRA_COLUMNS)
        writer.writeheader()
        summaries = [
            {column_name: value} | analyze_group(rows)
            for value, rows in itertools.groupby(table, group_key)
        ]
        summaries.sort(key=operator.itemgetter('Pages'), reverse=True)
        writer.writerows(summaries)


def analyze_group(group):
    """Total pages for one group, split by grayscale/color and duplex status."""
    totals = dict.fromkeys(EXTRA_COLUMNS, 0)
    for row in group:
        pages = int(row['Pages']) * int(row['Copies'])
        totals['Pages'] += pages
        if row['Grayscale'] == 'GRAYSCALE':
            totals['Grayscale'] += pages
        elif row['Grayscale'] == 'NOT GRAYSCALE':
            totals['Color'] += pages
        if row['Duplex'] == 'DUPLEX':
            totals['Duplex'] += pages
        elif row['Duplex'] == 'NOT DUPLEX':
            totals['Not Duplex'] += pages
    return totals


if __name__ == '__main__':
    main()

How can I convert JSON format text to dataframe?

I am trying to convert below JSON format text to pandas or spark data frame, but it is giving below error.
ERROR: JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Python CODE:
# Read a slightly malformed JSON file and repair the text before parsing it.
import json

path = "sample.json"
with open(path, 'r') as myfile:
    data = myfile.read()

# Strip whitespace noise and the trailing commas that make json.loads fail.
for broken, fixed in (('\t', ''), ('\n', ''), (',}', '}'), (',]', ']')):
    data = data.replace(broken, fixed)

obj = json.loads(data)
JSON file format
Output of data after reading .json file by using open function
How can I convert above text as a data frame?
I got, I added few lines of code
import pandas as pd  # BUG FIX: `pd` was used below without being imported

path = "sample.json"
with open(path, 'r') as myfile:
    data = myfile.read()

# Normalise the malformed JSON text: drop whitespace noise, trailing commas,
# and replace JSON null with an empty-string literal so eval() can digest it.
data = data.replace('\t', '')
data = data.replace('\n', '')
data = data.replace(',}', '}')
data = data.replace(',]', ']')
data = data.replace("null", "''")

# Split the top-level array into one string per object, restoring the '}'
# that splitting on '},' consumed (the last piece already ends with '}').
liss = []
data1 = data[1:-1]
data2 = data1.split("},")
for piece in data2:
    liss.append(piece if piece.endswith("}") else piece + "}")

# SECURITY NOTE: eval() executes arbitrary code -- only use this on trusted
# files; json.loads would be the safe choice once the text is valid JSON.
sample_df = pd.DataFrame({"Col1": liss})
sample_df["Col1"] = sample_df["Col1"].apply(lambda x: dict(eval(x)))
df3 = sample_df["Col1"].apply(pd.Series)
df3
I think you can read the json and save it in a dictionary.
Once you have this dictionary you can create a spark dataframe with the following line of code
df = spark.createDataFrame(dict)

Python CSV get particular row data

This is csv file
name,country,code
Georgina,Saint Helena,ET
Brooks,Austria,LR
Rosaline,Peru,DZ
How to get a particular row data without looping the whole csv file?
Looking for following syntax:
If searchName exist in csv, extract the data
# Print every row whose 'name' column matches the searched name.
searchName = 'Brooks'
with open('name.csv', 'r') as file:
    for row in csv.DictReader(file):
        if row['name'] == searchName:
            print(row['name'] + ' >> ' + row['country'])
Thanks
Update panda solution for those who interested
import pandas as pd

# Select every row matching the name; .empty tells us whether anything matched.
df = pd.read_csv('a.csv')
select_row = df.loc[df['name'] == 'Brooks']
if not select_row.empty:
    print('Print Record')
    print(select_row.country)
else:
    print('No records')
Get first instance
import re  # BUG FIX: the original snippet used re without importing it

search_name = 'Brooks'
with open('name.csv', 'r') as file:
    # Grab the rest of the first line that contains the searched name.
    output = re.search(f'{search_name}.*', file.read())
# NOTE(review): output is None when nothing matches, so .group() would raise
# AttributeError -- guard for that case if misses are possible.
row = output.group().split(',')
print(row[0], '>>', row[1])
Get all instances
import re  # BUG FIX: the original snippet used re without importing it

search_name = 'Brooks'
with open('name.csv', 'r') as file:
    # findall returns every line fragment containing the searched name.
    output = re.findall(f'{search_name}.*', file.read())
for row in output:
    items = row.split(',')
    print(items[0], '>>', items[1])
Using DataFrames
import pandas as pd

search_name = 'Brooks'
df = pd.read_csv('name.csv')
# Boolean mask on the 'name' column; iloc[0] takes the first match
# (raises IndexError if there is no matching row).
output = df[df.name == search_name].iloc[0]
print(output['name'], '>>', output['country'])
You could try using pandas and make your life easier, try something like this :
import pandas as pd

df = pd.read_csv('name.csv')
# NOTE(review): .iloc[5, 6] reads the single scalar at row 5 / column 6 and
# tests its truthiness -- confirm those positions exist in your file.
if df.iloc[5, 6]:
    # execute condition
    pass
else:  # BUG FIX: the original 'else' was missing its colon (SyntaxError)
    # execute another condition
    pass
Although dataframe seems to be the best option, if you treat the csv as a simple text file, This should help you:
import re  # BUG FIX: the original snippet used re without importing it

searchName = 'Brooks'
with open('name.csv', 'r') as f:
    foo = f.read()
# re.MULTILINE lets '$' anchor at each line end, so every matching row is captured.
items = re.findall(f"{searchName}.*$", foo, re.MULTILINE)
print(items)
Output:
['Brooks,Austria,LR']

Summing values from duplicate keys in a CSV file without panda

I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
# Question code, kept as-is for review.  NOTE(review): this is Python 2 style
# ('rb' mode plus reader.next()); Python 3 needs open(t1file, 'rt', newline='')
# and next(reader).
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open(t1file,'rb') as f:
reader = csv.reader(f)
# The hard-coded headers above are immediately replaced by the file's real header row.
headers = reader.next()
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
if row[headers.index('type')] == "Equity":
# Drop the _N suffix: GOOGLE_2 -> GOOGLE.
new_qualifier = qualifier.split("_")[0]
# BUG: all three membership tests below look in the TOP-LEVEL dict; they
# should check data[party] and data[party][cp] for the nested keys.
if party in data.keys():
if cp in data.keys():
if new_qualifier in data.keys():
data[party][cp][new_qualifier] += float(amount)
else:
# BUG: data[party] holds the STRING stored below, so indexing it with a
# str key raises "TypeError: string indices must be integers".
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
else:
# BUG: stores the bare string instead of an empty nested dict.
data[cp] = cp
else:
data[party] = party
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
Very rusty with Python — apologies if it's glaringly obvious — but any insights as to what I'm doing wrong?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
from collections import defaultdict

# Column positions in the input file.
PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3

# Running total per "party,cp,qualifier-prefix" key.
data = defaultdict(int)

with open('del-me.csv') as f:
    rows = [line.strip() for line in f.readlines()]

for row in rows[1:]:  # rows[0] is the header
    fields = row.split(',')
    # Keep only the qualifier text before the '_' suffix (GOOGLE_2 -> GOOGLE).
    prefix = fields[QUALIFIER_IDX]
    prefix = prefix[:prefix.find('_')]
    key = ','.join([fields[PARTY_IDX], fields[CP_IDX], prefix])
    data[key] += int(fields[AMOUNT_IDX])

with open('out.csv', 'w') as f:
    for k, v in data.items():
        f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv as csv
# The hard-coded list is only a placeholder; it is overwritten by the real header below.
headers = ["valuation_date","party_group_name","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open('test_data.csv','rt', encoding='utf-8') as f:
reader = csv.reader(f)
headers = next(reader)
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
# Only "Equity" rows are aggregated.
if row[headers.index('type')] == "Equity":
# Drop the _N suffix before grouping: GOOGLE_2 -> GOOGLE.
new_qualifier = qualifier.split("_")[0]
if party in data.keys():
cp_ = data[party]
if cp in cp_.keys():
qualifier_ = data[party][cp]
if new_qualifier in qualifier_.keys():
data[party][cp][new_qualifier] += float(amount)
else:
# NOTE(review): this indexes with the raw qualifier and the amount; it
# likely should be data[party][cp][new_qualifier] = float(amount) -- confirm.
data[party][cp][qualifier][amount] = {}
else:
# NOTE(review): inserts cp at the TOP level; probably meant
# data[party][cp] = {} followed by storing the amount -- confirm.
data[cp] = {}
else:
# First row for this party: build the nested dicts and seed the amount.
data[party] = {}
data[party][cp] = {}
data[party][cp][qualifier.split("_")[0]] = float(amount)
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp(filename=None):
    """Sum the amounts of duplicate "party,cp,qualifier" keys in a CSV file.

    The last comma-separated field of each line is the amount; everything
    before it is used verbatim as the grouping key.  Lines containing the
    word 'party' are treated as the header and skipped.

    filename defaults to the module-level ``t1file`` so existing no-argument
    call sites keep working.  Returns a dict mapping key -> summed amount.
    """
    if filename is None:
        filename = t1file
    d = {}
    # BUG FIX: the original opened the file in binary mode ('rb'), which makes
    # each line a bytes object and breaks the str operations below on Python 3.
    with open(filename, 'r') as f:
        for line in f:
            if 'party' in line:
                continue  # skip header
            key, value = line.rsplit(',', 1)  # split once at the end
            d[key] = d[key] + int(value) if key in d else int(value)
    # BUG FIX: the original computed the totals but never returned them.
    return d
You can do it like this:
from csv import DictReader, DictWriter

# Accumulate one record per (party, cp, qualifier) combination.
map_dic = {}
with open('test1.csv', 'r') as fr:
    for record in DictReader(fr, delimiter=','):
        key = '{}_{}_{}'.format(record['party'], record['cp'], record['qualifier'])
        if key in map_dic:
            map_dic[key]['amount'] += int(record['amount'])
        else:
            map_dic[key] = {
                'party': record['party'],
                'cp': record['cp'],
                'qualifier': record['qualifier'],
                'amount': int(record['amount']),
            }

# Write the aggregated records back out with the same column order.
with open('test2.csv', 'w') as csvfile:
    writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
    writer.writeheader()
    for summary in map_dic.values():
        writer.writerow(summary)

How to read several rows from a csv

I have a csv file which contains among other things the names and the phone numbers. I'm only interested in a name only if I've its phone number.
with open(phone_numbers) as f:
reader = csv.DictReader(f)
names = [record['Name'] for record in reader if record['phone']]
But I also want the respective phone number, I've try this:
user_data = {}
with open(phone_numbers) as f:
reader = csv.DictReader(f)
user_data['Name'] = [record['Name'] for record in reader if record['phone']]
# BUG: the first comprehension exhausts `reader` (a one-shot iterator over the
# file), so this second pass sees no rows and always yields an empty list.
user_data['phone'] = [record['phone'] for record in reader if record['phone']]
But for the second item I got an empty list; I'm guessing that reader is a one-shot iterator and that's why I can't iterate over it twice.
I've try to use tuples, but only had worked this way:
user_data = {}
with open(phone_numbers) as f:
reader = csv.DictReader(f)
user_data['Name'] = [(record['Name'],record['phone']) for record in reader if record['phone']]
In that case I have the two variables, phone and Name stored in user_data['Name'], that isn't what I want.
And if I try this:
user_data = {}
with open(phone_numbers) as f:
reader = csv.DictReader(f)
user_data['Name'],user_data['phone'] = [(record['Name'],record['phone']) for record in reader if record['phone']]
I got the following error:
ValueError: too many values to unpack
Edit:
This is a sample of the table:
+--------+---------------+
| Phone | Number |
+--------+---------------+
| Luis | 000 111 22222 |
+--------+---------------+
| Paul | 000 222 3333 |
+--------+---------------+
| Andrea | |
+--------+---------------+
| Jorge | 111 222 3333 |
+--------+---------------+
So all rows have a Name but not all have phones.
You can use dict to convert your list of tuple into dictionary. Also you need to use get if you have record without phone value.
import csv

user_data = {}
with open(phone_numbers) as f:
    reader = csv.DictReader(f)
    # BUG FIX: the original line was missing its closing bracket (SyntaxError),
    # and record.get('phone') can return None, which has no .strip().
    user_data = {
        record['Name']: record['phone']
        for record in reader
        if (record.get('phone') or '').strip()
    }
If you want a list of names and phones separately you can use the * expression
with open(phone_numbers) as f:
    reader = csv.DictReader(f)
    # BUG FIX: the original indexed record['name'] / record['value'], but the
    # file's columns (per the question) are 'Name' and 'phone'; also guard
    # against a missing phone value before calling .strip().
    names, phones = zip(*[(record['Name'], record['phone'])
                          for record in reader
                          if (record.get('phone') or '').strip()])
I think there is a much easier approach. Because it is a CSV file with column headings, as you indicate, there is a value for phone in each row — it is either empty or not — so this tests for an empty value and, when it is not empty, adds the name and phone to user_data.
import csv

# Collect one {'Name': ..., 'phone': ...} dict per row that has a phone value.
user_data = []
# BUG FIX: csv must be fed text on Python 3 ('rb' -> 'r'), and the original
# `if` line was missing its colon (SyntaxError).
with open(f, 'r') as fh:
    my_reader = csv.DictReader(fh)
    for row in my_reader:
        if row['phone'] != '':
            user_details = dict()
            user_details['Name'] = row['Name']
            user_details['phone'] = row['phone']
            user_data.append(user_details)
By using DictReader we are letting the magic happen so we don't have to worry about seek etc.
If I did not understand and you want a dictionary then easy enough
import csv

user_data = dict()
# BUG FIX: 'rb' -> 'r' for Python 3, and the `if` was missing its colon.
with open(f, 'r') as fh:
    my_reader = csv.DictReader(fh)
    for row in my_reader:
        if row['phone'] != '':
            # NOTE(review): this overwrites the single 'Name' key on every match,
            # keeping only the last row; to map name -> phone it should read
            # user_data[row['Name']] = row['phone'] -- confirm intent.
            user_data['Name'] = row['phone']
Your guess is quite right. If this is the approach you want take - iteration twice, you should use seek(0)
reader = csv.DictReader(f)
user_data['Name'] = [record['Name'] for record in reader if record['phone']]
f.seek(0) # role back to begin of file ...
reader = csv.DictReader(f)
user_data['phone'] = [record['phone'] for record in reader if record['phone']]
However, this is not very efficient and you should try and get your data in one roll. The following should do it in one roll:
user_data = {}

def extract_user(user_data, record):
    """Store `record` in `user_data` keyed by its name, when it has a phone."""
    if not record['phone']:
        return
    user_data.update({record.pop('name'): record})

# Side-effect comprehension: fills user_data from every row of `reader`.
[extract_user(user_data, record) for record in reader]
Example:
In [20]: cat phones.csv
name,phone
hans,01768209213
grettel,
henzel,123457123
In [21]: f = open('phones.csv')
In [22]: reader = csv.DictReader(f)
In [24]: %paste
user_data = {}
def extract_user(user_data, record):
if record['phone']:
name = record.pop('name')
user_data.update({name: record})
[extract_user(user_data, record) for record in reader]
## -- End pasted text --
Out[24]: [None, None, None]
In [25]: user_data
Out[25]: {'hans': {'phone': '01768209213'}, 'henzel': {'phone': '123457123'}}
Is it possible that what you're looking for is throwing away some info in your data file?
In [26]: !cat data00.csv
Name,Phone,Address
goofey,,ade
mickey,1212,heaven
tip,3231,earth
In [27]: f = open('data00.csv')
In [28]: r = csv.DictReader(f)
In [29]: lod = [{'Name':rec['Name'], 'Phone':rec['Phone']} for rec in r if rec['Phone']]
In [30]: lod
Out[30]: [{'Name': 'mickey', 'Phone': '1212'}, {'Name': 'tip', 'Phone': '3231'}]
In [31]:
On the other hand, should your file contain ONLY Name and Phone columns, it's
just
In [31]: lod = [rec for rec in r if rec['Phone']]
I normally use row indexing:
input = open('mycsv.csv', 'r')
user_data = {}
for row in csv.reader(input):
if row[<row # containing phone>]:
name = row[<row # containing name>]
user_data[name] = row[<row # containing phone>]
You were correct the whole time, except for the unpacking.
result = [(record["name"], record["phone"]) for record in reader if record["phone"]]
# this gives [(name1, phone1), (name2,phone2),....]
You have to do [dostuff for name, phone in result] not name,phone = result, which does not make sense semantically and syntactically.

Categories

Resources