Generate multiple files from single file based on column values - python

I have just started learning Python and I am trying to use it to automate a manual task that I currently do with Excel's filter feature.
Every month I receive a CSV file. I open it in Excel, filter on the carrier column, create a new file for each value in that field, and share each file with the respective carrier.
Here is some sample data from my CSV. I have shown only 2 carriers here, but there are more than 13 values:
carrier,type,count
DTH,a,123
DTH,b,3123
DTH,c,41341
DTH,d,13411
BLUEDART,a,12123
BLUEDART,b,31231
BLUEDART,c,411
BLUEDART,d,11
Expected output
DTH.csv
carrier,type,count
DTH,a,123
DTH,b,3123
DTH,c,41341
DTH,d,13411
BLUEDART.csv
carrier,type,count
BLUEDART,a,12123
BLUEDART,b,31231
BLUEDART,c,411
BLUEDART,d,11
Any help or just guidance is highly appreciated.

Very easy using pandas:
import pandas as pd

# Path to the monthly input file
carriers_csv_path = r"C:\Users\Bluetab\PycharmProjects\utils\csvGeneratorStack\csvCarriers.csv"
carrier_df = pd.read_csv(carriers_csv_path)

# Group the rows by the carrier column
grouped_by_carrier = carrier_df.groupby("carrier")

# Write one CSV per distinct carrier value
unique_keys = carrier_df['carrier'].unique()
for unique_key in unique_keys:
    grouped_by_carrier.get_group(unique_key).to_csv("./" + unique_key + ".csv", sep=",", index=False)
Hope it helps.
Tomas
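As a follow-up note: you can also iterate directly over the groupby object instead of calling get_group for each unique key. A minimal sketch, assuming the same kind of input path and per-carrier output naming as above:

import pandas as pd

carriers_csv_path = "csvCarriers.csv"  # placeholder for the real input path
carrier_df = pd.read_csv(carriers_csv_path)
# groupby yields (carrier value, sub-DataFrame) pairs, one per distinct carrier
for carrier, group in carrier_df.groupby("carrier"):
    group.to_csv("./" + carrier + ".csv", index=False)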

Using the standard library of Python only:
import csv

def write_output(header_row, carrier_name, c_rows):
    print("writing output for " + carrier_name)
    with open("c:\\tmp\\" + carrier_name + ".csv", "w", newline="") as outfile:
        outwriter = csv.writer(outfile, delimiter=",")
        outwriter.writerow(header_row)
        for outrow in c_rows:
            outwriter.writerow(outrow)

with open("c:\\tmp\\carrier.csv", newline="") as csvfile:
    creader = csv.reader(csvfile, delimiter=",")
    first_row = True
    header_row = None
    groups = {}
    for row in creader:
        if first_row:
            header_row = row
            first_row = False
        else:
            if not row[0] in groups:
                groups[row[0]] = [row]
            else:
                groups[row[0]].append(row)

for gr in groups:
    write_output(header_row, gr, groups[gr])
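If the monthly file ever becomes too large to hold all the groups in memory, a streaming variant can write each row straight to its carrier's file as it is read. A sketch only, using the same c:\tmp paths assumed in the answer above:

import csv

writers = {}  # carrier name -> (open file handle, csv writer)
with open("c:\\tmp\\carrier.csv", newline="") as csvfile:
    creader = csv.reader(csvfile, delimiter=",")
    header_row = next(creader)
    for row in creader:
        carrier = row[0]
        if carrier not in writers:
            # First time we see this carrier: open its file and write the header
            outfile = open("c:\\tmp\\" + carrier + ".csv", "w", newline="")
            outwriter = csv.writer(outfile)
            outwriter.writerow(header_row)
            writers[carrier] = (outfile, outwriter)
        writers[carrier][1].writerow(row)

# Close all the per-carrier files
for outfile, _ in writers.values():
    outfile.close()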

Related

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents of that column look like JSON and therefore effectively hold several more columns. I would like to split these contents into multiple columns, or extract the final part of them after "value". See the picture below for an example of the file. Any ideas how to do this? I would prefer to use Python. I don't have any experience with JSON.
Using pandas you could do this in a simpler way.
EDIT updated to handle the single quotes:
import pandas as pd
import json
data = pd.read_csv('test.csv', delimiter="\n")["content"]
res = [json.loads(row.replace("'", '"')) for row in data]
result = pd.DataFrame(res)
result.head()
# Export result to CSV
result.to_csv("result.csv")
[Screenshots from the original answer: the input CSV and the resulting DataFrame]
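If the single-quote replacement ever breaks (for example when a value itself contains an apostrophe), ast.literal_eval can parse the Python-style dict strings directly. A sketch only, assuming the content column can be read as plain strings from the same test.csv:

import ast
import pandas as pd

data = pd.read_csv('test.csv')["content"]
# literal_eval understands single-quoted dicts, so no quote replacement is needed
res = [ast.literal_eval(row) for row in data]
result = pd.DataFrame(res)
result.to_csv("result.csv", index=False)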
This script will create a new CSV file with the 'value' added as an additional column.
(Make sure that input_csv and output_csv are different filenames.)
import csv
import json

input_csv = "data.csv"
output_csv = "data_updated.csv"

values = []
with open(input_csv) as f_in:
    dr = csv.DictReader(f_in)
    for row in dr:
        value = json.loads(row["content"].replace("'", '"'))["value"]
        values.append(value)

with open(input_csv) as f_in:
    with open(output_csv, "w+") as f_out:
        w = csv.writer(f_out, lineterminator="\n")
        r = csv.reader(f_in)
        all = []
        row = next(r)
        row.append("value")
        all.append(row)
        i = 0
        for row in r:
            row.append(values[i])
            all.append(row)
            i += 1
        w.writerows(all)
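A single-pass variant with DictReader/DictWriter is sketched below; it avoids reading the input twice. The data.csv / data_updated.csv names are the same placeholders as above, and it assumes the 'content' values parse after the quote replacement:

import csv
import json

with open("data.csv", newline="") as f_in, \
        open("data_updated.csv", "w", newline="") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames + ["value"])
    writer.writeheader()
    for row in reader:
        # Parse the JSON-like string and pull out its "value" field
        row["value"] = json.loads(row["content"].replace("'", '"'))["value"]
        writer.writerow(row)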

file handling python - vlookup

source.csv as follows.
AB;CD
a;1;
b;2;
c;3;
target.csv as follows.
DE;FG;HI
1;e;1;
2;a;2;
3;f;3;
I need to do a vlookup using file handling mechanisms in Python.
I need to update column 'FG' of target.csv by looking it up in column 'AB' of source.csv and replacing it with the corresponding 'CD' value from source.csv.
So my desired output is like below.
DE;FG;HI
1;e;1;
2;1;2; #a is replaced with 1
3;f;3;
Without using pandas or any other module, how can I approach this?
Below is how I approached it.
with open('D:/target.csv', "w+", encoding="utf-8") as Tgt_csvFile:
    with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
        for line in Src_csvFile:
            fields = line.split(";")
            x = fields[0]
            for line_1 in Tgt_csvFile:
                fields_1 = line_1.split(";")
                y = fields[1]
                if y == x:
                    update  # not sure how to do this
                else:
                    keep as it is
I appreciate any support.
This will solve your particular problem, but if the number of input/output columns changes you will need to adjust the logic accordingly.
It's also worth noting that the trailing ; on each non-header row of your CSV file will cause most packages to assume there is an extra column. I don't think you want that.
import csv

# Read in input, creating a dict where the key is column 1 and the value is column 2
with open('source.csv', mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    s = {x[0]: x[1] for x in reader}
print(s)

# If column 2 is a key in dict s, update it with the value from the dict
output = []
with open('target.csv', mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    for row in reader:
        if row[1] in s.keys():
            row[1] = s[row[1]]
        output.append(row)

# Output to csv
with open('output.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=';')
    writer.writerows(output)
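A variant keyed on the header names rather than on column positions is sketched below. It assumes the same source.csv / target.csv layout and ';' delimiter; the extra unnamed column created by the trailing ';' (mentioned above) is simply ignored on output:

import csv

# Build the lookup table from source.csv: AB value -> CD value
with open('source.csv', newline='') as infile:
    lookup = {row['AB']: row['CD'] for row in csv.DictReader(infile, delimiter=';')}

# Read target.csv and replace FG wherever a lookup match exists
with open('target.csv', newline='') as infile:
    reader = csv.DictReader(infile, delimiter=';')
    fieldnames = reader.fieldnames
    rows = list(reader)

for row in rows:
    row['FG'] = lookup.get(row['FG'], row['FG'])

with open('output.csv', 'w', newline='') as outfile:
    # extrasaction='ignore' drops the unnamed column produced by the trailing ';'
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter=';', extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows)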
Here is my suggestion:
with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
    l = Src_csvFile.readlines()

d = {}
for i in l[1:]:
    x = i.split(';')
    d[x[0]] = x[1]

with open('D:/target.csv', "r", encoding="utf-8") as Tgt_csvFile:
    m = Tgt_csvFile.readlines()

for i in range(1, len(m)):
    x = m[i].split(';')
    if x[1] in d:
        x[1] = d.get(x[1])
    m[i] = ';'.join(x)

with open('D:/target.csv', "w", encoding="utf-8") as Tgt_csvFile:
    Tgt_csvFile.writelines(m)
Output:
DE;FG;HI
1;e;1;
2;1;2;
3;f;3;

Merge CSVs in Python with different columns

I have hundreds of large CSV files that I would like to merge into one. However, not all CSV files contain all columns. Therefore, I need to merge files based on column name, not column position.
Just to be clear: in the merged CSV, a cell should be left empty when the row it comes from did not have that cell's column.
I cannot use the pandas module, because it makes me run out of memory.
Is there a module that can do that, or some easy code?
The csv.DictReader and csv.DictWriter classes should work well (see Python docs). Something like this:
import csv

inputs = ["in1.csv", "in2.csv"]  # etc

# First determine the field names from the top line of each input file
# Comment 1 below
fieldnames = []
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            if h not in fieldnames:
                fieldnames.append(h)

# Then copy the data
with open("out.csv", "w", newline="") as f_out:  # Comment 2 below
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    for filename in inputs:
        with open(filename, "r", newline="") as f_in:
            reader = csv.DictReader(f_in)  # Uses the field names in this file
            for line in reader:
                # Comment 3 below
                writer.writerow(line)
Comments from above:
1. You need to specify all the possible field names in advance to DictWriter, so you need to loop through all your CSV files twice: once to find all the headers, and once to read the data. There is no better solution, because all the headers need to be known before DictWriter can write the first line. This part would be more efficient using sets instead of lists (the in operator on a list is comparatively slow), but it won't make much difference for a few hundred headers. Sets would also lose the deterministic ordering of a list: your columns would come out in a different order each time you ran the code. (A sketch of an alternative follows after these comments.)
2. The above code is for Python 3, where weird things happen in the CSV module without newline="". Remove this for Python 2.
3. At this point, line is a dict with the field names as keys, and the column data as values. You can specify what to do with blank or unknown values in the DictReader and DictWriter constructors.
This method should not run out of memory, because it never has the whole file loaded at once.
For those of us using 2.7, this adds an extra linefeed between records in "out.csv". To resolve this, just change the file mode from "w" to "wb".
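On comment 1 above: in Python 3.7+ a plain dict gives both the faster membership test of a set and the stable insertion order of a list, so the header-collection loop could be written like this sketch (same hypothetical in1.csv/in2.csv names):

import csv

inputs = ["in1.csv", "in2.csv"]  # etc

seen = {}  # dict keys: O(1) lookup, insertion order preserved
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        headers = next(csv.reader(f_in))
        seen.update(dict.fromkeys(headers))
fieldnames = list(seen)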
The solution by @Aaron Lockey, which is the accepted answer, worked well for me except that the output file had no header row: only the row data was written, so each column was without a heading (key). So I inserted the following:
writer.writeheader()
and it worked perfectly fine for me! So now the entire code appears like this:
import csv

inputs = ["in1.csv", "in2.csv"]  # etc

# First determine the field names from the top line of each input file
fieldnames = []
for filename in inputs:
    with open(filename, "r", newline="") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            if h not in fieldnames:
                fieldnames.append(h)

# Then copy the data
with open("out.csv", "w", newline="") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()  # this is the addition
    for filename in inputs:
        with open(filename, "r", newline="") as f_in:
            reader = csv.DictReader(f_in)  # Uses the field names in this file
            for line in reader:
                writer.writerow(line)
You can use the pandas module to do this pretty easily. This snippet assumes all your csv files are in the current folder.
import pandas as pd
import os
all_csv = [file_name for file_name in os.listdir(os.getcwd()) if '.csv' in file_name]
li = []
for filename in all_csv:
    df = pd.read_csv(filename, index_col=None, header=0, parse_dates=True, infer_datetime_format=True)
    li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('melted_csv.csv', index=False)
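Since the question says pandas runs out of memory on these files, a chunked variant that never holds more than one chunk at a time might look like the sketch below; the chunk size and the melted_csv.csv output name are placeholders, and the output file is assumed not to sit in the folder being globbed:

import glob
import pandas as pd

all_csv = glob.glob("*.csv")

# First pass: collect the union of all column names (headers only, no data rows)
columns = []
for name in all_csv:
    for col in pd.read_csv(name, nrows=0).columns:
        if col not in columns:
            columns.append(col)

# Second pass: stream each file in chunks, align to the full column set, append
first = True
for name in all_csv:
    for chunk in pd.read_csv(name, chunksize=100_000):
        chunk.reindex(columns=columns).to_csv(
            "melted_csv.csv",
            mode="w" if first else "a",
            header=first,
            index=False,
        )
        first = False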
I've faced a situation where not only the number of columns differed, but some column names were missing as well. For that kind of situation, and for your case too, this code snippet can help you :)
The tricky part is naming the columns which have no names and adding them to the dictionary. The read_csv_file function plays the main role here.
import csv

def read_csv_file(csv_file_path):
    headers = []
    data = []
    with open(csv_file_path, 'r') as f:
        csv_reader = csv.reader(f)
        rows = []
        for i, row in enumerate(csv_reader):
            if i == 0:
                # Name the header columns, inventing "col-N" names for blank headers
                for j in range(len(row)):
                    if row[j].strip() == "":
                        col_name = f"col-{j+1}"
                    else:
                        col_name = row[j]
                    if col_name not in headers:
                        headers.append(col_name)
            else:
                rows.append(row)
                # A data row longer than the header row adds extra "col-N" columns
                if len(row) > len(headers):
                    for j in range(len(row)):
                        if j+1 > len(headers):
                            col_name = f"col-{j+1}"
                            if col_name not in headers:
                                headers.append(col_name)
        # Convert each row into a dict keyed by the header names
        for i, row in enumerate(rows):
            row_data = {}
            for j in range(len(headers)):
                if len(row) > j:
                    row_data[headers[j]] = row[j]
                else:
                    row_data[headers[j]] = ''
            data.append(row_data)
    return headers, data

def write_csv_file(file_path, rows):
    if len(rows) > 0:
        headers = list(rows[0].keys())
        with open(file_path, 'w', newline='', encoding='UTF8') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

# The list of the csv file paths which will be merged
files_to_be_merged = [
    'file-1.csv',
    'file-2.csv',
    'file-3.csv'
]

# Read and store all the file data in new_file_data
final_headers = []
new_file_data = []
for f1 in files_to_be_merged:
    single_file_data = read_csv_file(f1)
    for h in single_file_data[0]:
        if h not in final_headers:
            final_headers.append(h)
    new_file_data += single_file_data[1]

# Add the missing keys to the dictionaries
for d in new_file_data:
    for h in final_headers:
        if d.get(h) is None:
            d[h] = ""

# Write a new file
target_file_name = 'merged_file.csv'
write_csv_file(target_file_name, new_file_data)

How to merge vertically several csv files in Python?

I need to merge vertically the data from several CSV spreadsheets in Python. Their structure is identical, I just need to put one table's data on top of the following because they are the months on an annual survey. I tried several methods I found googling but I can't find a way to do something as simple as:
import csv
spreadsheets1 = open('0113_RE_fscom.csv','r')
spreadsheets2 = open('0213_RE_fscom.csv','r')
spreadsheets = spreadsheets1 + spreadsheets2
with spreadsheet as csvfile:
    sales = csv.reader(csvfile)
    for row in sales:
        print row
Looks like you simply forgot to iterate over files. Try this code:
import csv

spreadsheet_filenames = [
    '0113_RE_fscom.csv',
    '0213_RE_fscom.csv',
]

for filename in spreadsheet_filenames:
    with open(filename, 'r') as csvfile:
        sales = csv.reader(csvfile)
        for row in sales:
            print row
How about this:
import csv
from itertools import izip

with open('0113_RE_fscom.csv', 'r') as f1, open('0213_RE_fscom.csv', 'r') as f2:
    csv1 = csv.reader(f1, delimiter=',')
    csv2 = csv.reader(f2, delimiter=',')
    for line1, line2 in izip(csv1, csv2):
        print line1 + line2
This is quite simple with pandas.
import pandas as pd
f1 = pd.read_csv('0113_RE_fscom.csv', header=None)
f2 = pd.read_csv('0213_RE_fscom.csv', header=None)
merged = pd.concat([f1, f2])
merged.to_csv('merged.csv', index=None, header=None)
Remove header=None if your files actually do have a header.
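For a whole year of monthly files, the same idea extends with glob; a sketch, assuming the file names all follow the *_RE_fscom.csv pattern shown above and that the files have no header row:

import glob
import pandas as pd

# Collect the monthly files matching the naming pattern, in sorted name order
files = sorted(glob.glob('*_RE_fscom.csv'))
frames = [pd.read_csv(f, header=None) for f in files]

# Stack them vertically and write a single CSV
pd.concat(frames, ignore_index=True).to_csv('merged.csv', index=False, header=False)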

How to perform a simple calculation in a CSV and append the results to the file

I have a CSV which contains 38 columns of data. All I want to find out is how to divide column 11 by column 38 and append the result to the end of each row, skipping the title row of the CSV (row 1).
If I am able to get a snippet of code that can do this, I will be able to manipulate the same code to perform lots of similar functions.
My attempt involved editing some code that was designed for something else.
See below:
from collections import defaultdict

class_col = 11
data_col = 38

# Read in the data
with open('test.csv', 'r') as f:
    # if you have a header on the file
    # header = f.readline().strip().split(',')
    data = [line.strip().split(',') for line in f]

# Append the relevant sum to the end of each row
for row in xrange(len(data)):
    data[row].append(int(class_col)/int(data_col))

# Write the results to a new csv file
with open('testMODIFIED2.csv', 'w') as nf:
    nf.write('\n'.join(','.join(row) for row in data))
Any help will be greatly appreciated. Thanks SMNALLY
import csv

with open('test.csv', 'rb') as old_csv:
    csv_reader = csv.reader(old_csv)
    with open('testMODIFIED2.csv', 'wb') as new_csv:
        csv_writer = csv.writer(new_csv)
        for i, row in enumerate(csv_reader):
            if i != 0:
                row.append(float(row[10]) / float(row[37]))
            csv_writer.writerow(row)
Use pandas:
import pandas
df = pandas.read_csv('test.csv') #assumes header row exists
df['FRACTION'] = 1.0*df['CLASS']/df['DATA'] #by default new columns are appended to the end
df.to_csv('out.csv')
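Since the question refers to the columns by position (11 and 38) rather than by name, a positional variant with iloc is sketched below; CLASS and DATA above, and the out.csv name here, are just placeholder names:

import pandas as pd

df = pd.read_csv('test.csv')  # assumes a header row, as in the answer above
# iloc selects by position: column 11 is index 10, column 38 is index 37
df['FRACTION'] = df.iloc[:, 10] / df.iloc[:, 37]
df.to_csv('out.csv', index=False)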
