Save dictionaries to CSV in Python

I am trying to save the dictionary below to CSV. The format of my dictionary's contents is:
dict = {0: [u'ab', u'cd'], 1: [u'b'], 2: [u'Ge', u'TT'], 3: [u'Stas'], 4: [u'sap', u'd3', u'ch99']}
My code is:
with open('Cr_pt.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(dict.keys())
    writer.writerows(zip(*dict.values()))
Below is the format I am trying to save to the CSV:
1 2 3 4
ab Ge Stas sap
cd TT d3
ch99
However, I am getting only a few values from the dictionary in my final CSV, such as:
1 2 3 4
ab Ge Stas sap

Firstly, you should avoid using dict as a variable name, as it shadows a built-in. The reason rows go missing is that zip() stops at the shortest of the value lists, so here only a single data row survives (the length of the shortest list, [u'b']).
The problem at hand is straightforward to solve with pandas:
import pandas as pd

data = {0: [u'ab', u'cd'], 1: [u'b'], 2: [u'Ge', u'TT'], 3: [u'Stas'], 4: [u'sap', u'd3', u'ch99']}
pd.DataFrame.from_dict(data, orient='index').T.to_csv('Cr_pt.csv', index=False)
Output:
$ cat Cr_pt.csv
0,1,2,3,4
ab,b,Ge,Stas,sap
cd,,TT,,d3
,,,,ch99
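If you'd rather stay with the standard library, the direct fix for the original code is itertools.zip_longest, which pads the shorter value lists instead of truncating them the way zip() does. A minimal Python 3 sketch (the variable is renamed to data to avoid shadowing the built-in):
import csv
from itertools import zip_longest

data = {0: [u'ab', u'cd'], 1: [u'b'], 2: [u'Ge', u'TT'], 3: [u'Stas'], 4: [u'sap', u'd3', u'ch99']}
with open('Cr_pt.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(data.keys())
    # zip_longest fills missing cells with '' instead of dropping rows
    writer.writerows(zip_longest(*data.values(), fillvalue=''))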

Here's one solution that writes to the file directly, without using the csv module, delimiting each row with commas to produce a file of comma-separated values (CSV):
data = {0: [u'ab', u'cd'], 1: [u'b'], 2: [u'Ge', u'TT'], 3: [u'Stas'], 4: [u'sap', u'd3', u'ch99']}
delimiter = ","
with open('data.csv', 'w') as data_file:
    keys = [str(key) for key in data.keys()]
    values = data.values()
    max_values = max([len(value) for value in values])
    data_file.write(delimiter.join(keys) + "\n")
    for line in range(0, max_values):
        line_values = []
        for key in keys:
            try:
                line_values.append(data[int(key)][line])
            except IndexError:  # this list has no value at this row index
                line_values.append("")
        data_file.write(delimiter.join(line_values) + "\n")
Output:
0,1,2,3,4
ab,b,Ge,Stas,sap
cd,,TT,,d3
,,,,ch99
In this format it is also easier to read the data back into another script, since the values are comma-separated rather than delimited by variable-length whitespace.
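For example, reading the file back into a dictionary of columns could look like this (a sketch, assuming the header row of keys written above):
import csv

with open('data.csv', newline='') as f:
    reader = csv.reader(f)
    keys = next(reader)  # header row: '0', '1', ...
    columns = {key: [] for key in keys}
    for row in reader:
        for key, value in zip(keys, row):
            if value:  # skip the '' padding cells
                columns[key].append(value)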

Related

How to do vlookup without pandas using python script

I have two CSV files and need Python code to do a vlookup: match rows on a shared ID column, take only the needed column, and create a new CSV file. I know it can be done with pandas, but I need to do this without pandas or any third-party tools.
INPUT 1 csv file
ID NAME SUBJECT
1 Raj CS
2 Allen PS
3 Bradly DP
4 Tim FS
INPUT 2 csv file
ID COUNTRY TIME
2 USA 1:00
4 JAPAN 14:00
1 ENGLAND 5:00
3 CHINA 0.00
OUTPUT csv file
ID NAME SUBJECT COUNTRY
1 Raj CS ENGLAND
2 Allen PS USA
3 Bradly DP CHINA
4 Tim FS JAPAN
There is probably a more efficient way to do it, but the basic idea is to create a nested dictionary (using the ID as the key) with the other column names and their values stored under that ID. Then, as you iterate through each file, the dictionary is updated on the ID key.
Finally, put the rows together into a list and write them to file:
input_files = ['C:/test/input_1.csv', 'C:/test/input_2.csv']
lookup_column_name = 'ID'

output_dict = {}
for file in input_files:
    file = open(file, 'r')
    header = {}
    # Read each line in the csv
    for idx, line in enumerate(file.readlines()):
        # If it's the first line, store it as the header
        if idx == 0:
            header = line.split(',')
            # Get the index value of the lookup column from the list of headers
            header_dict = {idx: x.strip() for idx, x in enumerate(header)}
            lookup_column_idx = dict((v, k) for k, v in header_dict.items())[lookup_column_name]
            continue
        line_split = line.split(',')
        # Initialize the dictionary by lookup column
        if line_split[lookup_column_idx] not in output_dict.keys():
            output_dict[line_split[lookup_column_idx]] = {}
        # If not the lookup column, then add the other column and data to the dictionary
        for idx, value in enumerate(line_split):
            if idx != lookup_column_idx:
                output_dict[line_split[lookup_column_idx]].update({header_dict[idx]: value})
    file.close()

# Create a list of the rows that will be written to file under the correct columns
rows = []
for k, v in output_dict.items():
    header = [lookup_column_name] + list(v.keys())
    row = [k] + [output_dict[k][x].strip() for x in header if x != lookup_column_name]
    row = ','.join(row) + '\n'
    rows.append(row)

# Final list of rows, beginning with the header
output_lines = [','.join(header) + '\n'] + rows

# Write to file
output = open('C:/test/output.csv', 'w')
output.writelines(output_lines)
output.close()
To do this without pandas (and assuming you know the structure of your data and that it fits in memory), you can iterate through the csv file and store the results in a dictionary, where you fill the entries mapping each ID to the other information that you want to keep.
You can do this for both csv files and join them manually afterwards by iterating over the keys of the dictionary.
import csv

input1 = './file1.csv'
input2 = './file2.csv'
with open(input1, 'r', encoding='utf-8-sig') as inputlist:
    with open(input2, 'r', encoding='utf-8-sig') as inputlist1:
        with open('./output.csv', 'w', newline='', encoding='utf-8-sig') as output:
            reader = csv.reader(inputlist)
            reader2 = csv.reader(inputlist1)
            writer = csv.writer(output)
            dict1 = {}
            for xl in reader2:
                dict1[xl[0]] = xl[1]
            for i in reader:
                if i[2] in dict1:
                    i.append(dict1[i[2]])
                    writer.writerow(i)
                else:
                    i.append("N/A")
                    writer.writerow(i)
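For the exact layout in the question, a csv.DictReader variant keeps the lookup readable. This is a sketch, assuming comma-separated files with the ID/NAME/SUBJECT and ID/COUNTRY/TIME headers shown above:
import csv

# Build an ID -> COUNTRY lookup from the second file
with open('input_2.csv', newline='') as f:
    country_by_id = {row['ID']: row['COUNTRY'] for row in csv.DictReader(f)}

# Append COUNTRY to each row of the first file
with open('input_1.csv', newline='') as f, open('output.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['ID', 'NAME', 'SUBJECT', 'COUNTRY'])
    for row in csv.DictReader(f):
        writer.writerow([row['ID'], row['NAME'], row['SUBJECT'],
                         country_by_id.get(row['ID'], 'N/A')])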

How to access, edit & save the text of a list inside another list?

import csv
#!pip install tweet-preprocessor
import preprocessor as p

p.set_options(p.OPT.MENTION, p.OPT.URL, p.OPT.EMOJI)
f = open('April 9.csv', 'r')
d = [i for i in csv.reader(f)]
for i in d:
    print(p.clean(d[i]))
April 9.csv is a CSV file having only one column of text but up to 4000 rows. clean() takes in a string, but according to what I have here, the program is just passing a list to clean(). I want to pass clean() a string corresponding to each row of the CSV file, and then save the result as a CSV file.
Updated for Martinaeu's Comment
Using readline/write
Just write each cleaned line to your new file as you read it from the CSV:
#!pip install tweet-preprocessor
import preprocessor as p

p.set_options(p.OPT.MENTION, p.OPT.URL, p.OPT.EMOJI)
with open('April 9.csv', 'r') as rf, open('New File.csv', 'w+') as wf:
    for line in rf:
        wf.write(p.clean(line) + '\n')  # clean() strips the trailing newline, so add it back
Using csv (for the sake of explanation)
d = [i for i in csv.reader(f)] is creating a list of lists because you are working with 2 dimensions in a CSV, rows and columns. If you had a CSV like
A1, A2
B1, B2
C1, C2
...
then d would contain
[
    ['A1', 'A2'],
    ['B1', 'B2'],
    ['C1', 'C2'],
    ...
]
In your specific case, d is
[
    ['<Row 1 value>'],
    ['<Row 2 value>'],
    ['<Row 3 value>'],
    ...
    ['<Row 4000 value>']
]
You need to change your list comprehension to clean() only the single column's value:
d = [p.clean(i[0]) for i in csv.reader(f)] # Option 1: 1-D list
or
d = [[p.clean(i[0])] for i in csv.reader(f)] # Option 2: 2-D list
The first option avoids storing a 2-D list, since you're only working with a single column; but note that writerow() below takes a list of columns to write, so you can go either route.
Once you have the list you want to store, you can then write it to another CSV file like so:
with open('New CSV.csv', 'w') as f:
    writer = csv.writer(f)
    for i in d:
        writer.writerow([i])   # If you use option 1
        # writer.writerow(i)   # If you use option 2

XLSXwriter - writing multiple nested dictionaries

I need to write a multi-nested dictionary to an Excel file. The dictionary is structured as {dict1: {dict2: {dict3: value}}}. My third write loop is raising a KeyError: '' even though there ought not be any blank keys.
I attempted to use this abhorrent monster, as it has worked wonderfully for smaller dictionaries, but 1) there has to be a better way and 2) I'm unable to scale it for this dictionary...
import xlsxwriter

workbook = xlsxwriter.Workbook('datatest.xlsx')
worksheet = workbook.add_worksheet('test1')
row = 0
col = 0
for key in sam_bps.keys():
    row += 1
    worksheet.write(row, col, key)
    for key in sam_bps[sam].keys():
        row, col = 0, 1
        worksheet.write(row, col, key)
        row += 1
        for key in sam_bps[sam][bp].keys():
            row, col = 0, 2
            worksheet.write(row, col, key)
            row += 1
            for key in sam_bps[sam][bp][mpn].keys():
                row, col = 0, 3
                worksheet.write(row, col, key)
                row += 1
                for item in sam_bps[sam][bp][mpn].keys():
                    row, col = 0, 4
                    worksheet.write(row, col, item)
                    row += 1
workbook.close()
I've also considered converting the dictionary to a list of tuples or a list of lists, but it doesn't output how I need, and it would probably cost more time to split those apart afterwards anyway.
Here's the code for the dictionary:
sam_bps = {}
sam_bps_header = ['SAM', 'BP', 'MPN', 'PLM_Rate']
for row in plmdata:
    sam, mpn, bp, doc = row[24], row[5], row[7], row[2]
    if sam == '':
        sam = 'Unknown'
    if doc == 'Requirement':
        if sam not in sam_bps:
            sam_bps[sam] = {bp: {mpn: heatscores[mpn]}}
        elif bp not in sam_bps[sam]:
            sam_bps[sam][bp] = {mpn: heatscores[mpn]}
        elif mpn not in sam_bps[sam][bp]:
            sam_bps[sam][bp][mpn] = heatscores[mpn]
print(sam_bps['Dan Reiser'])
EDIT: Added print statement to show output per feedback
{'Fortress Solutions': {'MSM5118160F60JS': 45}, 'Benchmark Electronics': {'LT1963AES8': 15}, 'Axxcss Wireless Solutions Inc': {'MGA62563TR1G': 405}}
I'd like to see this output in an Excel file, with the first column of course being the first key of sam_bps.
Your question would be easier to answer if you provided an example of the dictionary you are trying to save.
Have you considered just serializing/deserializing the dictionary to JSON format? You can save and re-load the file with minimal code:
import json

data = {'test': {'test2': {'test3': 2}}}
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)

with open('data.json') as data_file:
    data = json.load(data_file)
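If the target is still an Excel file rather than JSON, another option is to flatten the three levels into one worksheet row per innermost value. A minimal sketch, assuming the SAM -> BP -> {MPN: rate} structure and the sam_bps_header from the question:
import xlsxwriter

workbook = xlsxwriter.Workbook('datatest.xlsx')
worksheet = workbook.add_worksheet('test1')
worksheet.write_row(0, 0, ['SAM', 'BP', 'MPN', 'PLM_Rate'])
row = 1
for sam, bps in sam_bps.items():
    for bp, mpns in bps.items():
        for mpn, rate in mpns.items():
            # one worksheet row per SAM/BP/MPN combination
            worksheet.write_row(row, 0, [sam, bp, mpn, rate])
            row += 1
workbook.close()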

Merge 2 csv file with one unique column but different header [duplicate]

This question already has answers here:
Merging two CSV files using Python
(2 answers)
Closed 7 years ago.
I want to merge 2 CSV files using some scripting language (like bash script or Python).
1st.csv (this data is from mysql query)
member_id,name,email,desc
03141,ej,ej#domain.com,cool
00002,jes,jes#domain.com,good
00002,charmie,charm#domain.com,sweet
2nd.csv (from mongodb query)
id,address,create_date
00002,someCity,20150825
00003,newCity,20140102
11111,,20150808
The examples are not the actual data, though I know that some of the member_id values from MySQL and the id values from MongoDB are the same.
(And I wish my output to be something like this:)
desiredoutput.csv
member_id,name,email,desc,address,create_date
03141,ej,ej#domain.com,cool,,
00002,jes,jes#domain.com,good,someCity,20150825
00002,charmie,charm#domain.com,sweet,
11111,,,,20150808
Help will be much appreciated. Thanks in advance.
#########################################################################
#!/usr/bin/python
import csv
import itertools as IT

filenames = ['1st.csv', '2nd.csv']
handles = [open(filename, 'rb') for filename in filenames]
readers = [csv.reader(f, delimiter=',') for f in handles]

with open('desiredoutput.csv', 'wb') as h:
    writer = csv.writer(h, delimiter=',', lineterminator='\n')
    for rows in IT.izip_longest(*readers, fillvalue=['']*2):
        combined_row = []
        for row in rows:
            row = row[:1]  # column where I know there are identical data
            if len(row) == 1:
                combined_row.extend(row)
            else:
                combined_row.extend(['']*1)
        writer.writerow(combined_row)

for f in handles:
    f.close()
#########################################################################
I just read this code on this site too, and tried manipulating it.
Since you haven't posted an attempt, I'll give you a general answer (using Python) to get you started.
1. Create a dict, d.
2. Iterate over all the rows of the first file, convert each row into a list, and store it in d using member_id as the key and the list as the value.
3. Iterate over all the rows of the second file, convert each row into a list leaving out the id column, and update the list under d[id] with the new list if d[id] exists; otherwise store the new list under d[id].
4. Finally, iterate over the values in d and print them out comma-separated to a file.
Edit
In your attempt, you are trying to use izip_longest to iterate over the rows of both files at the same time. But that would work only if both files had an equal number of rows in the same order.
Anyhow, here is one way of doing it.
Note: This is using the Python 3.4+ csv module. For 2.7 it might look a little different.
import csv

d = {}
with open("file1.csv", newline="") as f:
    for row in csv.reader(f):
        # pad file1 rows with two empty cells for address and create_date
        d.setdefault(row[0], []).append(row + [""] * 2)
with open("file2.csv", newline="") as f:
    for row in csv.reader(f):
        # create a stub row for ids that only appear in the second file
        rows = d.setdefault(row[0], [[row[0], "", "", "", "", ""]])
        for old_row in rows:
            old_row[4:] = row[1:]  # fill in address and create_date
with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for rows in d.values():
        writer.writerows(rows)
Here's a suggestion using pandas, which I got from this answer and the pandas documentation about merging.
import pandas as pd
first = pd.read_csv('1st.csv')
second = pd.read_csv('2nd.csv')
merged = pd.concat([first, second], axis=1)
This will output:
member_id name email desc id address create_date
3141 ej ej#domain.com cool 2 someCity 20150825
2 jes jes#domain.com good 11 newCity 20140102
11 charmie charm#domain.com sweet 11111 NaN 20150808
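Note that concat(axis=1) pastes the two frames together positionally, which is what produces the misaligned rows above. If the rows should instead be matched on the id columns, a key-based join is closer to the desired output; a sketch, assuming the column names from the two files:
merged = first.merge(second, left_on='member_id', right_on='id', how='outer')
merged.to_csv('desiredoutput.csv', index=False)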

Comparing two csv files and getting difference

I have two CSV files that I need to compare and then spit out the differences:
CSV FORMAT:
Name Produce Number
Adam Apple 5
Tom Orange 4
Adam Orange 11
I need to compare the two CSV files and then tell whether there is a difference between Adam's apples on sheet 1 and sheet 2, and do that for all names and produce numbers. Both CSV files will be formatted the same.
Any pointers will be greatly appreciated
I have used csvdiff:
$ pip install csvdiff
$ csvdiff --style=compact col1 a.csv b.csv
Link to package on pypi
I found this link useful
If your CSV files aren't so large that they'll bring your machine to its knees when you load them into memory, then you could try something like:
import csv

with open('file1.csv') as f1, open('file2.csv') as f2:
    # rows must be hashable to go into a set, so read each one as a tuple
    set1 = set(map(tuple, csv.reader(f1)))
    set2 = set(map(tuple, csv.reader(f2)))

print(set1 - set2)  # in 1, not in 2
print(set2 - set1)  # in 2, not in 1
print(set1 & set2)  # in both
For large files, you could load them into a SQLite3 database and use SQL queries to do the same, or sort by relevant keys and then do a match-merge.
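A minimal sketch of the SQLite route, assuming the three-column Name/Produce/Number layout from the question (file names are placeholders):
import csv
import sqlite3

conn = sqlite3.connect(':memory:')
for table, fname in (('t1', 'file1.csv'), ('t2', 'file2.csv')):
    conn.execute('CREATE TABLE {} (name TEXT, produce TEXT, number TEXT)'.format(table))
    with open(fname, newline='') as f:
        rows = list(csv.reader(f))[1:]  # skip the header row
        conn.executemany('INSERT INTO {} VALUES (?, ?, ?)'.format(table), rows)

# Rows present in file1 but missing from file2 (swap t1/t2 for the reverse)
for row in conn.execute('SELECT * FROM t1 EXCEPT SELECT * FROM t2'):
    print(row)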
One of the best utilities for comparing two different files is diff.
See Python implementation here: Comparing two .txt files using difflib in Python
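For instance, a minimal line-based diff using the standard library's difflib (a sketch; it compares raw lines, not parsed fields):
import difflib

with open('file1.csv') as f1, open('file2.csv') as f2:
    diff = difflib.unified_diff(f1.readlines(), f2.readlines(),
                                fromfile='file1.csv', tofile='file2.csv')
    print(''.join(diff))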
import csv

def load_csv_to_dict(fname, get_key, get_data):
    with open(fname, newline='') as inf:
        incsv = csv.reader(inf)
        next(incsv)  # skip header
        return {get_key(row): get_data(row) for row in incsv}

def main():
    key = lambda r: tuple(r[0:2])
    data = lambda r: int(r[2])
    f1 = load_csv_to_dict('file1.csv', key, data)
    f2 = load_csv_to_dict('file2.csv', key, data)
    f1keys = set(f1.keys())
    f2keys = set(f2.keys())
    print("Keys in file1 but not file2:")
    print(", ".join(str(a) + ":" + str(b) for a, b in (f1keys - f2keys)))
    print("Keys in file2 but not file1:")
    print(", ".join(str(a) + ":" + str(b) for a, b in (f2keys - f1keys)))
    print("Differing values:")
    for k in (f1keys & f2keys):
        a, b = f1[k], f2[k]
        if a != b:
            print("{}:{} {} <> {}".format(k[0], k[1], a, b))

if __name__ == "__main__":
    main()
If you want to use Python's csv module along with a generator function, you can use nested looping to compare large .csv files. The example below compares each row using a cursory comparison:
import csv

def csv_lazy_get(csvfile):
    with open(csvfile, newline='') as f:
        r = csv.reader(f)
        for row in r:
            yield row

def csv_cmp_lazy(csvfile1, csvfile2):
    gen_2 = csv_lazy_get(csvfile2)
    for row_1 in csv_lazy_get(csvfile1):
        row_2 = next(gen_2)
        print("row_1: ", row_1)
        print("row_2: ", row_2)
        if row_2 == row_1:
            print("row_1 is equal to row_2.")
        else:
            print("row_1 is not equal to row_2.")
    gen_2.close()
Here's a start that does not use difflib. It is really just a point to build from, because maybe Adam and apples appear twice on a sheet; can you ensure that is not the case? Should the apples be summed, or is that an error?
import csv

fsock = open('sheet.csv', 'r')
rdr = csv.reader(fsock)
sheet1 = {}
for row in rdr:
    name, produce, amount = row
    sheet1[(name, produce)] = int(amount)  # always an integer?
fsock.close()

# repeat the above for the second sheet, then compare
You get the idea?
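The elided comparison step might then look like this (a sketch, assuming sheet2 was built from the second file the same way):
# compare over the union of (name, produce) keys from both sheets
for key in sheet1.keys() | sheet2.keys():
    a, b = sheet1.get(key), sheet2.get(key)
    if a != b:
        name, produce = key
        print("{}/{}: {} vs {}".format(name, produce, a, b))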
