Building list of dicts from csv using Python

I have a CSV file containing three columns and many rows. All the data is strings. I am trying to read the CSV one line at a time, convert each row to a dict, and append each dict to a list so that I end up with a list of dicts. The environment is AWS Lambda, and the CSV comes from an S3 bucket.
My code:
csv_object = s3.Object('MyBucket', 'My.csv')
csv_file = csv_object.get()['Body'].read().decode('utf-8')

f = StringIO(csv_file)
reader = csv.reader(f, delimiter=',')
list_of_json = []
mydevice = {}

for row in reader:
    mydevice["device"] = row[0]
    mydevice["serial"] = row[1]
    mydevice["software"] = row[2]
    list_of_json.append(mydevice)
The software runs (i.e., it doesn't error), but it doesn't produce the desired result. If I print(list_of_json) after the for loop completes, I want it to produce this:
[{"device":"Dev1", "serial":"Ser1", "software":"software1"},{.....}]
But what it actually produces is just an empty list, as if the append statement didn't exist:
[]
The CSV reading and the for row in reader: parts all seem to work fine. If I do a print(mydevice) inside the for loop, I can see it working its way through all the devices successfully, but for reasons I can't fathom, the append statement never seems to append anything to the list_of_json list.

Why not just use csv.DictReader?
csv_object = s3.Object('MyBucket', 'My.csv')
csv_file = csv_object.get()['Body'].read().decode('utf-8')

f = StringIO(csv_file)
reader = csv.DictReader(f, ('device', 'serial', 'software'))
list_of_json = [dict(device) for device in reader]

# also don't forget to
f.close()  # or use contextlib.closing
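As the closing comment hints, contextlib.closing can take care of the cleanup; a minimal variant of the same snippet (s3 is the boto3 resource object from the question):

import csv
from contextlib import closing
from io import StringIO

csv_object = s3.Object('MyBucket', 'My.csv')
csv_file = csv_object.get()['Body'].read().decode('utf-8')

# closing() guarantees f.close() runs even if parsing raises
with closing(StringIO(csv_file)) as f:
    reader = csv.DictReader(f, ('device', 'serial', 'software'))
    list_of_json = [dict(device) for device in reader]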

You need to create a new dictionary inside the loop; otherwise every iteration mutates and re-appends the same mydevice object:
csv_object = s3.Object('MyBucket', 'My.csv')
csv_file = csv_object.get()['Body'].read().decode('utf-8')

f = StringIO(csv_file)
reader = csv.reader(f, delimiter=',')
list_of_json = []

for row in reader:
    mydevice = {}
    mydevice["device"] = row[0]
    mydevice["serial"] = row[1]
    mydevice["software"] = row[2]
    list_of_json.append(mydevice)
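To see why, here is a toy sketch (made-up rows) showing that appending the same dict just stores more references to one object:

rows = [["Dev1", "Ser1", "software1"], ["Dev2", "Ser2", "software2"]]

shared = {}
aliased = []
for device, serial, software in rows:
    shared["device"] = device  # mutates the one shared dict
    aliased.append(shared)     # appends another reference to it

print(aliased)                   # [{'device': 'Dev2'}, {'device': 'Dev2'}]
print(aliased[0] is aliased[1])  # True: both entries are the same object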

Related

DictWriter delete rows without enumeration or looping

I'm trying to read in a CSV, add a row at the bottom, and delete a row at the top. I have not been able to find a way to delete a row via the DictWriter object without converting to a list, deleting the row in the list, then writing it out using csv.writer.
There should be a better way than reading/writing twice.
python3.8, ubuntu
Thanks.
stime = get_time_str()
new_dict = {'Time': stime, 'Queries': Querycounter.value}
Querycounter.value = 0

# list of column names
field_names = ['Time', 'Queries']

# Open CSV file in append mode and
# append the new queries count at the end of the file
with open(AlpacaQueriesCSVfile, 'a') as f_object:
    dictwriter_object = DictWriter(f_object, fieldnames=field_names)
    dictwriter_object.writerow(new_dict)

# open using csv.reader, delete the row(s)
with open(AlpacaQueriesCSVfile, "r") as f:
    reader = csv.reader(f, delimiter=",")
    data = list(reader)  # should be a better way of doing this by deleting rows in the dictwriter_object above... later

row_count = len(data)
if row_count > 2880:
    logger.debug('Deleting row from Queries.csv')
    to_skip = row_count - 2880
    del data[1:to_skip]  # leave first row
    with open(QueriesCSVfile, 'w') as f:
        write = csv.writer(f)
        write.writerows(data)
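No built-in lets DictWriter drop rows in place, but the double handling can at least be bounded: a collections.deque with maxlen keeps only the newest rows in a single pass. A rough sketch under the question's assumptions (2880-row cap, header in the first row; the file name is a placeholder):

import csv
from collections import deque

MAX_ROWS = 2880  # cap from the question

with open('Queries.csv', 'r', newline='') as f:  # placeholder file name
    reader = csv.reader(f)
    header = next(reader)
    # deque silently discards the oldest rows once maxlen is reached
    recent = deque(reader, maxlen=MAX_ROWS)

with open('Queries.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(recent)

This still rewrites the file, but it never holds more than the cap in memory and needs no index arithmetic.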

python csv file add to field based off another field

I have a CSV file with a column called "Inventory"; within that column I pulled data from another source, and it is in a dictionary-like format.
What I need to do is iterate through the 1000+ lines; if a line's Inventory value contains the keywords comforter, sheets and pillow, write "bedding" to the "Location" column for that row, else write "home-fashions".
I have been able to get the if statement to tell me whether a row is "bedding" or "home-fashions"; I just do not know how to write the corresponding result to the "Location" field for that line.
In my script I'm printing just to see my results, but in the end I want to write back to the same CSV file.
from csv import DictReader

with open('test.csv', 'r') as read_obj:
    csv_dict_reader = DictReader(read_obj)
    for line in csv_dict_reader:
        if 'comforter' in line['Inventory'] and 'sheets' in line['Inventory'] and 'pillow' in line['Inventory']:
            print('Bedding')
            print(line['Inventory'])
        else:
            print('home-fashions')
            print(line['Inventory'])
The last column of your csv contains commas, so (unless that field is quoted) DictReader cannot parse it correctly.
import re

data = []
with open('test.csv', 'r') as f:
    # Get the header row
    header = next(f).strip().split(',')
    for line in f:
        # Parse 4 columns; the last group absorbs any remaining commas
        row = re.findall('([^,]*),([^,]*),([^,]*),(.*)', line.rstrip('\n'))[0]
        # Create a dictionary of one row
        item = {header[0]: row[0], header[1]: row[1],
                header[2]: row[2], header[3]: row[3]}
        # Add each row to the list
        data.append(item)
After preparing your data, you can check your conditions.

for item in data:
    if all(x in item['Inventory'] for x in ['comforter', 'sheets', 'pillow']):
        item['Location'] = 'Bedding'
    else:
        item['Location'] = 'home-fashions'
Write the output to a file.

import csv

with open('output.csv', 'w', newline='') as f:  # newline='' avoids blank lines on Windows
    dict_writer = csv.DictWriter(f, data[0].keys())
    dict_writer.writeheader()
    dict_writer.writerows(data)
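If only the last column can contain commas, str.split with a maxsplit is a possibly simpler alternative to the regex (a sketch, assuming the same four-column layout):

data = []
with open('test.csv', 'r') as f:
    header = next(f).rstrip('\n').split(',')
    for line in f:
        # split into at most 4 fields; the 4th keeps any embedded commas
        row = line.rstrip('\n').split(',', 3)
        data.append(dict(zip(header, row)))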
csv.DictReader returns a dict, so just assign the new value to the column:

if 'comforter' in line['Inventory'] and ...:
    line['Location'] = 'Bedding'
else:
    line['Location'] = 'home-fashions'
print(line['Inventory'])
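Putting the two answers together, a sketch of the full read-modify-write cycle the question asks for (assuming the file parses cleanly with DictReader and already has a Location column, as described):

import csv

with open('test.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
    fieldnames = reader.fieldnames

for row in rows:
    keywords = ('comforter', 'sheets', 'pillow')
    if all(k in row['Inventory'] for k in keywords):
        row['Location'] = 'Bedding'
    else:
        row['Location'] = 'home-fashions'

# Overwrite the original file with the updated rows
with open('test.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)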

Create Multiple Python dictionaries from CSV using same column as key

I have a csv:
Col1, Col2, Col3, ...
10, 0.024, 0.0012, ...
20, 0.0013, 0.43, ...
I want a list of dictionaries like so:
[{"Col1":"Col2"}, {"Col1": "Col3"},...]
with Col1 always as the key for each dictionary.
I've tried this and it works for the first dictionary, but produces empty dictionaries for all the others.
import os, csv

path = r"I:\ARC\WIP\KevinWIP\Risk\Data\PythonGui"
os.chdir(path)

with open('DispersalKernal10m.csv', mode='r') as infile:
    reader = csv.reader(infile)
    DistProb_LUT = [
        {rows[0]: rows[1] for rows in reader},
        {rows[0]: rows[2] for rows in reader},
        {rows[0]: rows[3] for rows in reader},
        {rows[0]: rows[4] for rows in reader},
        {rows[0]: rows[5] for rows in reader},
        {rows[0]: rows[6] for rows in reader},
        {rows[0]: rows[7] for rows in reader}]
infile.close()
print(DistProb_LUT)
I searched around, and nothing I tried worked. Any suggestions appreciated.
For creating the first dictionary, you are already looping through the entire file and reaching end-of-file, so for all the other dictionaries the file cursor is at the end, and they come out empty. Instead of dictionary comprehensions, use a for loop outside the dictionary-creation portion and change the logic a little, like below:
import os, csv

path = r'I:\ARC\WIP\KevinWIP\Risk\Data\PythonGui'
os.chdir(path)

DistProb_LUT = [{} for _ in range(7)]
with open('DispersalKernal10m.csv', mode='r') as infile:
    reader = csv.reader(infile)
    for rows in reader:
        for i in range(7):
            DistProb_LUT[i][rows[0]] = rows[i + 1]
You also do not need to close the infile as it would be automatically closed by with statement.
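A toy demo of that exhaustion, using an in-memory file:

import csv, io

f = io.StringIO("10,0.024\n20,0.0013\n")
reader = csv.reader(f)

print(list(reader))  # [['10', '0.024'], ['20', '0.0013']]
print(list(reader))  # [] because the underlying file is already at EOF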
Reading a file is not an operation you can repeat without reopening the file (or seeking back to the start): once the iterator reaches end-of-file, it stays there. Therefore, something like this may be of use to you:
import os, csv

path = r'I:\ARC\WIP\KevinWIP\Risk\Data\PythonGui'
os.chdir(path)

DistProb_LUT = [{} for i in range(7)]
with open('DispersalKernal10m.csv', mode='r') as infile:
    reader = csv.reader(infile)
    for row in reader:
        DistProb_LUT[0][row[0]] = row[1]
        DistProb_LUT[1][row[0]] = row[2]
        DistProb_LUT[2][row[0]] = row[3]
        DistProb_LUT[3][row[0]] = row[4]
        DistProb_LUT[4][row[0]] = row[5]
        DistProb_LUT[5][row[0]] = row[6]
        DistProb_LUT[6][row[0]] = row[7]

print(DistProb_LUT)
Also, you don't need the infile.close() line. The with statement takes care of that for you.
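The seven repeated assignments can also be collapsed with enumerate; an equivalent sketch (still assuming exactly seven value columns per row):

import csv

DistProb_LUT = [{} for _ in range(7)]
with open('DispersalKernal10m.csv', mode='r') as infile:
    for row in csv.reader(infile):
        # row[0] is the key; columns 1..7 feed the 7 dictionaries in order
        for i, value in enumerate(row[1:8]):
            DistProb_LUT[i][row[0]] = value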

Compare two CSV files and look for matches Python

I have two CSV files that look like this:
CSV1
H1,H2,H3
arm,biopsy,forearm
heart,leg biopsy,biopsy
organs.csv
arm
leg
forearm
heart
skin
I need to compare both files and get an output list like this: [arm, forearm, heart, leg]. The script I'm currently working on doesn't give me any output (I also want leg in the output, even though it is mixed with biopsy in the same cell). Here's the code so far. How can I get all the matched words?
import csv
import io

alist, blist = [], []

with open("csv1.csv", "rb") as fileA:
    reader = csv.reader(fileA, delimiter=',')
    for row in reader:
        alist.append(row)

with open("organs.csv", "rb") as fileB:
    reader = csv.reader(fileB, delimiter=',')
    for row in reader:
        blist.append(row)

first_set = set(map(tuple, alist))
secnd_set = set(map(tuple, blist))
matches = set(first_set).intersection(secnd_set)
print matches
Try this:
import csv

alist, blist = [], []

with open("csv1.csv", "rb") as fileA:
    reader = csv.reader(fileA, delimiter=',')
    for row in reader:
        for row_str in row:
            alist += row_str.strip().split()

with open("organs.csv", "rb") as fileB:
    reader = csv.reader(fileB, delimiter=',')
    for row in reader:
        blist += row

first_set = set(alist)
second_set = set(blist)
print first_set.intersection(second_set)
Basically, iterating through the csv file via csv.reader returns each row as a list of strings, like ['arm', 'biopsy', 'forearm'], so you have to concatenate those lists to collect the individual items (splitting on whitespace so that a cell like 'leg biopsy' contributes both words).
To remove duplicates, a single set() conversion per list is enough, and the intersection method returns another set with the common elements.
Change the part reading from csv1.csv to:
with open("csv1.csv", "rb") as fileA:
reader = csv.reader(fileA, delimiter=',')
for row in reader:
# append all words in cell
for word in row:
alist.append(word)
I would treat the CSV files as text files, get lists of all the words in the first and the second, then iterate over the first list to see if any exactly match any in the second list.
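A sketch of that idea (written for Python 3, unlike the snippets above): split both files on commas and whitespace, then intersect the resulting sets.

import re

def words_in(path):
    # Split on commas and any whitespace, dropping empty strings
    with open(path) as f:
        return {w for w in re.split(r'[,\s]+', f.read()) if w}

matches = words_in('csv1.csv') & words_in('organs.csv')
print(sorted(matches))

On the sample data above this prints ['arm', 'forearm', 'heart', 'leg'].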

Merge CSVs in Python with different columns

I have hundreds of large CSV files that I would like to merge into one. However, not all CSV files contain all columns. Therefore, I need to merge files based on column name, not column position.
Just to be clear: in the merged CSV, values should be empty for a cell coming from a line which did not have the column of that cell.
I cannot use the pandas module, because it makes me run out of memory.
Is there a module that can do that, or some easy code?
The csv.DictReader and csv.DictWriter classes should work well (see Python docs). Something like this:
import csv

inputs = ["in1.csv", "in2.csv"]  # etc

# First determine the field names from the top line of each input file
# Comment 1 below
fieldnames = []
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            if h not in fieldnames:
                fieldnames.append(h)

# Then copy the data
with open("out.csv", "w", newline="") as f_out:  # Comment 2 below
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    for filename in inputs:
        with open(filename, "r", newline="") as f_in:
            reader = csv.DictReader(f_in)  # Uses the field names in this file
            for line in reader:
                # Comment 3 below
                writer.writerow(line)
Comments from above:
1. You need to specify all the possible field names in advance to DictWriter, so you need to loop through all your CSV files twice: once to find all the headers, and once to read the data. There is no better solution, because all the headers need to be known before DictWriter can write the first line. This part would be more efficient using sets instead of lists (the in operator on a list is comparatively slow), but it won't make much difference for a few hundred headers. Sets would also lose the deterministic ordering of a list: your columns would come out in a different order each time you ran the code. A way to get both is sketched after this list.
2. The above code is for Python 3, where weird things happen in the CSV module without newline="". Remove this for Python 2.
3. At this point, line is a dict with the field names as keys and the column data as values. You can specify what to do with blank or unknown values in the DictReader and DictWriter constructors.
This method should not run out of memory, because it never has the whole file loaded at once.
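For completeness, a sketch of the set-plus-list approach mentioned in comment 1, keeping both the speed of a set and the deterministic order of a list (reusing the inputs list from the answer above):

import csv

inputs = ["in1.csv", "in2.csv"]  # same input list as above

# List preserves first-seen column order; set gives fast membership tests
fieldnames = []
seen = set()
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        for h in next(csv.reader(f_in)):
            if h not in seen:
                seen.add(h)
                fieldnames.append(h)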
For those of us using 2.7, this adds an extra linefeed between records in "out.csv". To resolve this, just change the file mode from "w" to "wb".
The solution by @Aaron Lockey, which is the accepted answer, worked well for me, except that the output file had no headers: only the row data, each column without headings (keys). So I inserted the following:
writer.writeheader()
and it worked perfectly fine for me! So now the entire code appears like this:
import csv

inputs = ["in1.csv", "in2.csv"]  # etc

# First determine the field names from the top line of each input file
fieldnames = []
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            if h not in fieldnames:
                fieldnames.append(h)

# Then copy the data
with open("out.csv", "w", newline="") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()  # this is the addition
    for filename in inputs:
        with open(filename, "r", newline="") as f_in:
            reader = csv.DictReader(f_in)  # Uses the field names in this file
            for line in reader:
                writer.writerow(line)
You can use the pandas module to do this pretty easily. This snippet assumes all your csv files are in the current folder.
import pandas as pd
import os

all_csv = [file_name for file_name in os.listdir(os.getcwd()) if '.csv' in file_name]

li = []
for filename in all_csv:
    df = pd.read_csv(filename, index_col=None, header=0, parse_dates=True, infer_datetime_format=True)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('melted_csv.csv', index=False)
I've faced a situation where not only does the number of columns differ, but some column names are missing entirely. For that kind of situation, and obviously for your case, this code snippet can help :)
The tricky part is naming the columns which have no names and adding them to the dictionary. The read_csv_file function plays the main role here.
import csv  # missing from the original snippet

def read_csv_file(csv_file_path):
    headers = []
    data = []
    with open(csv_file_path, 'r') as f:
        csv_reader = csv.reader(f)
        rows = []
        for i, row in enumerate(csv_reader):
            if i == 0:
                # Name the header columns, inventing "col-N" for blank ones
                for j in range(len(row)):
                    if row[j].strip() == "":
                        col_name = f"col-{j+1}"
                    else:
                        col_name = row[j]
                    if col_name not in headers:
                        headers.append(col_name)
            else:
                rows.append(row)
                # Invent headers for rows that are wider than the header line
                if len(row) > len(headers):
                    for j in range(len(row)):
                        if j + 1 > len(headers):
                            col_name = f"col-{j+1}"
                            if col_name not in headers:
                                headers.append(col_name)
        # Build one dict per row, padding short rows with ''
        for i, row in enumerate(rows):
            row_data = {}
            for j in range(len(headers)):
                if len(row) > j:
                    row_data[headers[j]] = row[j]
                else:
                    row_data[headers[j]] = ''
            data.append(row_data)
    return headers, data
def write_csv_file(file_path, rows):
    if len(rows) > 0:
        headers = list(rows[0].keys())
        with open(file_path, 'w', newline='', encoding='UTF8') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

# The list of the csv file paths which will be merged
files_to_be_merged = [
    'file-1.csv',
    'file-2.csv',
    'file-3.csv'
]

# Read and store all the file data in new_file_data
final_headers = []
new_file_data = []
for f1 in files_to_be_merged:
    single_file_data = read_csv_file(f1)
    for h in single_file_data[0]:
        if h not in final_headers:
            final_headers.append(h)
    new_file_data += single_file_data[1]

# Add the missing keys to the dictionaries
for d in new_file_data:
    for h in final_headers:
        if d.get(h) is None:
            d[h] = ""

# Write a new file
target_file_name = 'merged_file.csv'
write_csv_file(target_file_name, new_file_data)
