How to convert multiple json files to cvs files - python

Hello I have multiple json files in a path and I want to convert all of them to csv files separately. Here is what I have tried so far which just convert one json file to a csv file.
with open('/Users/hh/MyDataSet/traceJSON-663-661-A0-25449-7.json') as f:
for line in f:
data.append(json.loads(line))
csv_file=open('/Users/hh/MyDataSet/GTruth/traceJSON-663-661-A0-25449-7.csv','w')
write=csv.writer(csv_file)
# write.writerow(["row number","type","rcvTime","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
# ,"hed_x","hed_y","hed_z"])
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
# json_file.close()
print('done!')
csv_file.close()
I appreciate if anyone can help me how can I do it. Also in each json file name "traceJSON-663-661-A0-25449-7", the first number like in the above code (663) should be written in csv file like the following code,if the type is 2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',....
My json file names are like traceJSON-51-49-A16-25217-7, traceJSON-57-55-A0-25223-7, ....

I suggest using pandas for this:
from glob import glob
import pandas as pd
import os
filepaths = glob('/Users/hh/MyDataSet/*.json') # get list of json files in folder
for f in filepaths:
filename = os.path.basename(f).rsplit('.', 1)[0] # extract filename without extension
nr = int(filename.split('-')[1]) # extract the number from the filename - assuming that all filenames are formatted similarly, use regex otherwise
df = pd.read_json(f) # read the json file as a pandas dataframe, assuming the json file isn't nested
df['type'] = df['type'].replace(2, nr) # replace 2 in 'type' column with the number in the filename
df.to_csv(f'{filename}.csv') # save as csv
If you want to round columns, you can also do this with pandas

import csv
import glob
import json
import os.path
for src_path in glob.glob('/Users/hh/MyDataSet/*.json'):
src_name = os.path.splitext(os.path.basename(src_path))[0]
data = []
with open(src_path) as f:
for line in f:
data.append(json.loads(line))
dest_path = '/Users/hh/MyDataSet/GTruth/' + src_name + '.csv'
csv_file=open(dest_path,'w')
write=csv.writer(csv_file)
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
sender = src_name.split('-')[1]
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),sender,round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
csv_file.close()
print('done!')

Related

multiple csv files data to single json file

I had two csv files named mortality1 and mortality2 and i want to insert these two csv files data into a single json file...when i am inserting these data, i am unable give the two files at the same time to json file.and this is my code
import csv
import json
import pandas as pd
from glob import glob
csvfile1 = open('C:/Users/DELL/Desktop/data/mortality1.csv', 'r')
csvfile2 = open('C:/Users/DELL/Desktop/data/mortality2.csv', 'r')
jsonfile = open('C:/Users/DELL/Desktop/data/cvstojson.json', 'w')
df = pd.read_csv(csvfile1)
df.to_json(jsonfile)
i want insert the 2 csv files data at the same time to the json file
If both of your csv data are having similar structure, then you can append the data frames to one another, and then convert it to a JSON.
Like
import csv
import json
import pandas as pd
from glob import glob
csvfile1 = open('C:/Users/DELL/Desktop/data/mortality1.csv', 'r')
csvfile2 = open('C:/Users/DELL/Desktop/data/mortality2.csv', 'r')
jsonfile = open('C:/Users/DELL/Desktop/data/cvstojson.json', 'w')
# Read and append both dataframes to single one
df = pd.read_csv(csvfile1).append(pd.read_csv(csvfile2))
# Create the json representation of all rows together.
df.to_json(jsonfile, orient="records")

multiple json to csv using pandas python

trying to convert multiple json files to 1 csv file
tried 2 ways,
first one using pandas ,
second using json and csv writer
about my json
keys are unordered and some keys are different in every file
code using writer
file_list=os.listdir('output')
count = 0
for file in file_list:
dict={}
file_path = "output/" + file
with open(file_path,'r') as f:
jsonData=json.load(f)
datafile=open('data.csv','a')
csv_writer = csv.writer(datafile)
if count == 0:
header = jsonData.keys()
csv_writer.writerow(header)
count += 1
csv_writer.writerow(jsonData.values())
if count == 1:
csv_writer.writerow(jsonData.values())
datafile.close()
problem
bcoz my data is unordered and different keys so in my csv file wrong value is coming under wrong header
code using pandas
for file in file_list:
dict={}
file_path = "output/" + file
with open(file_path,'r') as f:
jsonData=json.load(f)
for j in jsonData:
dict.update({j:[jsonData[j]]})
df=pd.DataFrame(dict)
df.to_csv("hello.csv")
problem
i dont know how to append in pandas
so this is showing only 2 rows bcoz of my last json file i guess
inside my json
Try this code:
import pandas as pd
import json
import pathlib
data_path = pathlib.Path('.')
keys = ['Solutions', 'account_number', 'actual_reading_current','actual_reading_previous', 'address', 'amount_due']
dat = dict([(k, []) for k in keys])
for jfile in data_path.glob('*.json'):
with jfile.open('r') as ifile:
json_data = json.load(ifile)
for key in keys:
dat[key].append(json_data[key][0] if key in json_data else None)
result = pd.DataFrame.from_dict(dat)
result.to_csv('result.csv')
I first define a dictionary containing the columns that I want.
Then I read in the json files and append them as rows to the dictionary.
Note, that I had to edit your json files, one was missing a ending quote and I had to replace the single quotes by double quotes.

How to combine horizontally many CSV files using python csv or pandas module?

Hello!
I would like to combine horizontally many CSV files (the total number will oscillate around 120-150) into one CSV file by adding one column from each file (in this case column called “grid”). All those files have the same columns and number of the rows (they are constructed the same) and are stored in the same catalogue. I’ve tried with CSV module and pandas. I don't want to define all 120 files. I need a script to do it automatically. I’m stuck and I have no ideas...
Some input CSV files (data) and CSV file (merged) which I would like to get:
https://www.dropbox.com/transfer/AAAAAHClI5b6TPzcmW2dmuUBaX9zoSKYD1ZrFV87cFQIn3PARD9oiXQ
That's how my code looks like when I use the CSV module:
import os
import glob
import csv
os.chdir('\csv_files_direction')
extension = 'csv'
files = [i for i in glob.glob('*.{}'.format(extension))]
out_merg = ('\merged_csv_file_direction')
with open(out_merg,'wt') as out:
writer = csv.writer(out)
for file in files:
with open(file) as csvfile:
data = csv.reader(csvfile, delimiter=';')
result = []
for row in data:
a = row[3] #column which I need
result.append(a)
Using this code I receive values only from the last CSV. The rest is missing. As a result I would like to have one precise column from each CSV file from the catalogue.
And Pandas:
import os
import glob
import pandas as pd
import csv
os.chdir('\csv_files_direction')
extension = 'csv'
files = [i for i in glob.glob('*.{}'.format(extension))]
out_merg = ('\merged_csv_file_direction')
in_names = [pd.read_csv(f, delimiter=';', usecols = ['grid']) for f in files]
Using pandas I receive data from all CSV's as the list which can be navigated using e.g in_names[1].
I confess that this is my first try with pandas and I don't have ideas what should be my next step.
I will really appreciate any help!
Thanks in advance,
Mateusz
For the part of CSV i think you need another list define OUTSIDE the loop.
Something like
import os
import sys
dirname = os.path.dirname(os.path.realpath('__file__'))
import glob
import csv
extension = 'csv'
files = [i for i in glob.glob('*.{}'.format(extension))]
out_merg = ('merged_csv_file_direction')
result= []
with open(out_merg,'wt') as out:
writer = csv.writer(out)
for file in files:
with open(file) as csvfile:
data = csv.reader(csvfile, delimiter=';')
col = []
for row in data:
a = row[3] #column which I need
col.append(a)
result.append((col))
NOTE: I have also changed the way to go into the folder. Now you can run the file direcly in the folder that contains the 2 folders (one for take the data and the other to save the data)
Regarding the part of pandas
you can create a loop again. This time you need to CONCAT the dataframes that you have created using in_names = [pd.read_csv(f, delimiter=';', usecols = ['grid']) for f in files]
I think you can use
import os
import glob
import pandas as pd
import csv
os.chdir('\csv_files_direction')
extension = 'csv'
files = [i for i in glob.glob('*.{}'.format(extension))]
out_merg = ('\merged_csv_file_direction')
in_names = [pd.read_csv(f, delimiter=';', usecols = ['grid']) for f in files]
result = pd.concat(in_names)
Tell me if it works

concatenate all files into a directory with python [duplicate]

Guys, I here have 200 separate csv files named from SH (1) to SH (200). I want to merge them into a single csv file. How can I do it?
As ghostdog74 said, but this time with headers:
with open("out.csv", "ab") as fout:
# first file:
with open("sh1.csv", "rb") as f:
fout.writelines(f)
# now the rest:
for num in range(2, 201):
with open("sh"+str(num)+".csv", "rb") as f:
next(f) # skip the header, portably
fout.writelines(f)
Why can't you just sed 1d sh*.csv > merged.csv?
Sometimes you don't even have to use python!
Use accepted StackOverflow answer to create a list of csv files that you want to append and then run this code:
import pandas as pd
combined_csv = pd.concat( [ pd.read_csv(f) for f in filenames ] )
And if you want to export it to a single csv file, use this:
combined_csv.to_csv( "combined_csv.csv", index=False )
fout=open("out.csv","a")
for num in range(1,201):
for line in open("sh"+str(num)+".csv"):
fout.write(line)
fout.close()
I'm just going to throw another code example into the basket:
from glob import glob
with open('singleDataFile.csv', 'a') as singleFile:
for csvFile in glob('*.csv'):
for line in open(csvFile, 'r'):
singleFile.write(line)
It depends what you mean by "merging" -- do they have the same columns? Do they have headers? For example, if they all have the same columns, and no headers, simple concatenation is sufficient (open the destination file for writing, loop over the sources opening each for reading, use shutil.copyfileobj from the open-for-reading source into the open-for-writing destination, close the source, keep looping -- use the with statement to do the closing on your behalf). If they have the same columns, but also headers, you'll need a readline on each source file except the first, after you open it for reading before you copy it into the destination, to skip the headers line.
If the CSV files don't all have the same columns then you need to define in what sense you're "merging" them (like a SQL JOIN? or "horizontally" if they all have the same number of lines? etc, etc) -- it's hard for us to guess what you mean in that case.
Quite easy to combine all files in a directory and merge them
import glob
import csv
# Open result file
with open('output.txt','wb') as fout:
wout = csv.writer(fout,delimiter=',')
interesting_files = glob.glob("*.csv")
h = True
for filename in interesting_files:
print 'Processing',filename
# Open and process file
with open(filename,'rb') as fin:
if h:
h = False
else:
fin.next()#skip header
for line in csv.reader(fin,delimiter=','):
wout.writerow(line)
A slight change to the code above as it does not actually work correctly.
It should be as follows...
from glob import glob
with open('main.csv', 'a') as singleFile:
for csv in glob('*.csv'):
if csv == 'main.csv':
pass
else:
for line in open(csv, 'r'):
singleFile.write(line)
If you are working on linux/mac you can do this.
from subprocess import call
script="cat *.csv>merge.csv"
call(script,shell=True)
If the merged CSV is going to be used in Python then just use glob to get a list of the files to pass to fileinput.input() via the files argument, then use the csv module to read it all in one go.
OR, you could just do
cat sh*.csv > merged.csv
You can simply use the in-built csv library. This solution will work even if some of your CSV files have slightly different column names or headers, unlike the other top-voted answers.
import csv
import glob
filenames = [i for i in glob.glob("SH*.csv")]
header_keys = []
merged_rows = []
for filename in filenames:
with open(filename) as f:
reader = csv.DictReader(f)
merged_rows.extend(list(reader))
header_keys.extend([key for key in reader.fieldnames if key not in header_keys])
with open("combined.csv", "w") as f:
w = csv.DictWriter(f, fieldnames=header_keys)
w.writeheader()
w.writerows(merged_rows)
The merged file will contain all possible columns (header_keys) that can be found in the files. Any absent columns in a file would be rendered as blank / empty (but preserving rest of the file's data).
Note:
This won't work if your CSV files have no headers. In that case you can still use the csv library, but instead of using DictReader & DictWriter, you'll have to work with the basic reader & writer.
This may run into issues when you are dealing with massive data since the entirety of the content is being store in memory (merged_rows list).
Over the solution that made #Adders and later on improved by #varun, I implemented some little improvement too leave the whole merged CSV with only the main header:
from glob import glob
filename = 'main.csv'
with open(filename, 'a') as singleFile:
first_csv = True
for csv in glob('*.csv'):
if csv == filename:
pass
else:
header = True
for line in open(csv, 'r'):
if first_csv and header:
singleFile.write(line)
first_csv = False
header = False
elif header:
header = False
else:
singleFile.write(line)
singleFile.close()
Best regards!!!
You could import csv then loop through all the CSV files reading them into a list. Then write the list back out to disk.
import csv
rows = []
for f in (file1, file2, ...):
reader = csv.reader(open("f", "rb"))
for row in reader:
rows.append(row)
writer = csv.writer(open("some.csv", "wb"))
writer.writerows("\n".join(rows))
The above is not very robust as it has no error handling nor does it close any open files.
This should work whether or not the the individual files have one or more rows of CSV data in them. Also I did not run this code, but it should give you an idea of what to do.
I modified what #wisty said to be worked with python 3.x, for those of you that have encoding problem, also I use os module to avoid of hard coding
import os
def merge_all():
dir = os.chdir('C:\python\data\\')
fout = open("merged_files.csv", "ab")
# first file:
for line in open("file_1.csv",'rb'):
fout.write(line)
# now the rest:
list = os.listdir(dir)
number_files = len(list)
for num in range(2, number_files):
f = open("file_" + str(num) + ".csv", 'rb')
f.__next__() # skip the header
for line in f:
fout.write(line)
f.close() # not really needed
fout.close()
Here is a script:
Concatenating csv files named SH1.csv to SH200.csv
Keeping the headers
import glob
import re
# Looking for filenames like 'SH1.csv' ... 'SH200.csv'
pattern = re.compile("^SH([1-9]|[1-9][0-9]|1[0-9][0-9]|200).csv$")
file_parts = [name for name in glob.glob('*.csv') if pattern.match(name)]
with open("file_merged.csv","wb") as file_merged:
for (i, name) in enumerate(file_parts):
with open(name, "rb") as file_part:
if i != 0:
next(file_part) # skip headers if not first file
file_merged.write(file_part.read())
Updating wisty's answer for python3
fout=open("out.csv","a")
# first file:
for line in open("sh1.csv"):
fout.write(line)
# now the rest:
for num in range(2,201):
f = open("sh"+str(num)+".csv")
next(f) # skip the header
for line in f:
fout.write(line)
f.close() # not really needed
fout.close()
Let's say you have 2 csv files like these:
csv1.csv:
id,name
1,Armin
2,Sven
csv2.csv:
id,place,year
1,Reykjavik,2017
2,Amsterdam,2018
3,Berlin,2019
and you want the result to be like this csv3.csv:
id,name,place,year
1,Armin,Reykjavik,2017
2,Sven,Amsterdam,2018
3,,Berlin,2019
Then you can use the following snippet to do that:
import csv
import pandas as pd
# the file names
f1 = "csv1.csv"
f2 = "csv2.csv"
out_f = "csv3.csv"
# read the files
df1 = pd.read_csv(f1)
df2 = pd.read_csv(f2)
# get the keys
keys1 = list(df1)
keys2 = list(df2)
# merge both files
for idx, row in df2.iterrows():
data = df1[df1['id'] == row['id']]
# if row with such id does not exist, add the whole row
if data.empty:
next_idx = len(df1)
for key in keys2:
df1.at[next_idx, key] = df2.at[idx, key]
# if row with such id exists, add only the missing keys with their values
else:
i = int(data.index[0])
for key in keys2:
if key not in keys1:
df1.at[i, key] = df2.at[idx, key]
# save the merged files
df1.to_csv(out_f, index=False, encoding='utf-8', quotechar="", quoting=csv.QUOTE_NONE)
With the help of a loop you can achieve the same result for multiple files as it is in your case (200 csv files).
If the files aren't numbered in order, take the hassle-free approach below:
Python 3.6 on windows machine:
import pandas as pd
from glob import glob
interesting_files = glob("C:/temp/*.csv") # it grabs all the csv files from the directory you mention here
df_list = []
for filename in sorted(interesting_files):
df_list.append(pd.read_csv(filename))
full_df = pd.concat(df_list)
# save the final file in same/different directory:
full_df.to_csv("C:/temp/merged_pandas.csv", index=False)
An easy-to-use function:
def csv_merge(destination_path, *source_paths):
'''
Merges all csv files on source_paths to destination_path.
:param destination_path: Path of a single csv file, doesn't need to exist
:param source_paths: Paths of csv files to be merged into, needs to exist
:return: None
'''
with open(destination_path,"a") as dest_file:
with open(source_paths[0]) as src_file:
for src_line in src_file.read():
dest_file.write(src_line)
source_paths.pop(0)
for i in range(len(source_paths)):
with open(source_paths[i]) as src_file:
src_file.next()
for src_line in src_file:
dest_file.write(src_line)
import pandas as pd
import os
df = pd.read_csv("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\Sales_April_2019.csv")
files = [file for file in os.listdir("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data")
for file in files:
print(file)
all_data = pd.DataFrame()
for file in files:
df=pd.read_csv("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\"+file)
all_data = pd.concat([all_data,df])
all_data.head()
I have done it by implementing a function that expect output file and paths of the input files.
The function copy the file content of the first file into the output file and then does the same for the rest of input files but without the header line.
def concat_files_with_header(output_file, *paths):
for i, path in enumerate(paths):
with open(path) as input_file:
if i > 0:
next(input_file) # Skip header
output_file.writelines(input_file)
Usage example of the function:
if __name__ == "__main__":
paths = [f"sh{i}.csv" for i in range(1, 201)]
with open("output.csv", "w") as output_file:
concat_files_with_header(output_file, *paths)

how to merge 200 csv files in Python

Guys, I here have 200 separate csv files named from SH (1) to SH (200). I want to merge them into a single csv file. How can I do it?
As ghostdog74 said, but this time with headers:
with open("out.csv", "ab") as fout:
# first file:
with open("sh1.csv", "rb") as f:
fout.writelines(f)
# now the rest:
for num in range(2, 201):
with open("sh"+str(num)+".csv", "rb") as f:
next(f) # skip the header, portably
fout.writelines(f)
Why can't you just sed 1d sh*.csv > merged.csv?
Sometimes you don't even have to use python!
Use accepted StackOverflow answer to create a list of csv files that you want to append and then run this code:
import pandas as pd
combined_csv = pd.concat( [ pd.read_csv(f) for f in filenames ] )
And if you want to export it to a single csv file, use this:
combined_csv.to_csv( "combined_csv.csv", index=False )
fout=open("out.csv","a")
for num in range(1,201):
for line in open("sh"+str(num)+".csv"):
fout.write(line)
fout.close()
I'm just going to throw another code example into the basket:
from glob import glob
with open('singleDataFile.csv', 'a') as singleFile:
for csvFile in glob('*.csv'):
for line in open(csvFile, 'r'):
singleFile.write(line)
It depends what you mean by "merging" -- do they have the same columns? Do they have headers? For example, if they all have the same columns, and no headers, simple concatenation is sufficient (open the destination file for writing, loop over the sources opening each for reading, use shutil.copyfileobj from the open-for-reading source into the open-for-writing destination, close the source, keep looping -- use the with statement to do the closing on your behalf). If they have the same columns, but also headers, you'll need a readline on each source file except the first, after you open it for reading before you copy it into the destination, to skip the headers line.
If the CSV files don't all have the same columns then you need to define in what sense you're "merging" them (like a SQL JOIN? or "horizontally" if they all have the same number of lines? etc, etc) -- it's hard for us to guess what you mean in that case.
Quite easy to combine all files in a directory and merge them
import glob
import csv
# Open result file
with open('output.txt','wb') as fout:
wout = csv.writer(fout,delimiter=',')
interesting_files = glob.glob("*.csv")
h = True
for filename in interesting_files:
print 'Processing',filename
# Open and process file
with open(filename,'rb') as fin:
if h:
h = False
else:
fin.next()#skip header
for line in csv.reader(fin,delimiter=','):
wout.writerow(line)
A slight change to the code above as it does not actually work correctly.
It should be as follows...
from glob import glob
with open('main.csv', 'a') as singleFile:
for csv in glob('*.csv'):
if csv == 'main.csv':
pass
else:
for line in open(csv, 'r'):
singleFile.write(line)
If you are working on linux/mac you can do this.
from subprocess import call
script="cat *.csv>merge.csv"
call(script,shell=True)
If the merged CSV is going to be used in Python then just use glob to get a list of the files to pass to fileinput.input() via the files argument, then use the csv module to read it all in one go.
OR, you could just do
cat sh*.csv > merged.csv
You can simply use the in-built csv library. This solution will work even if some of your CSV files have slightly different column names or headers, unlike the other top-voted answers.
import csv
import glob
filenames = [i for i in glob.glob("SH*.csv")]
header_keys = []
merged_rows = []
for filename in filenames:
with open(filename) as f:
reader = csv.DictReader(f)
merged_rows.extend(list(reader))
header_keys.extend([key for key in reader.fieldnames if key not in header_keys])
with open("combined.csv", "w") as f:
w = csv.DictWriter(f, fieldnames=header_keys)
w.writeheader()
w.writerows(merged_rows)
The merged file will contain all possible columns (header_keys) that can be found in the files. Any absent columns in a file would be rendered as blank / empty (but preserving rest of the file's data).
Note:
This won't work if your CSV files have no headers. In that case you can still use the csv library, but instead of using DictReader & DictWriter, you'll have to work with the basic reader & writer.
This may run into issues when you are dealing with massive data since the entirety of the content is being store in memory (merged_rows list).
Over the solution that made #Adders and later on improved by #varun, I implemented some little improvement too leave the whole merged CSV with only the main header:
from glob import glob
filename = 'main.csv'
with open(filename, 'a') as singleFile:
first_csv = True
for csv in glob('*.csv'):
if csv == filename:
pass
else:
header = True
for line in open(csv, 'r'):
if first_csv and header:
singleFile.write(line)
first_csv = False
header = False
elif header:
header = False
else:
singleFile.write(line)
singleFile.close()
Best regards!!!
You could import csv then loop through all the CSV files reading them into a list. Then write the list back out to disk.
import csv
rows = []
for f in (file1, file2, ...):
reader = csv.reader(open("f", "rb"))
for row in reader:
rows.append(row)
writer = csv.writer(open("some.csv", "wb"))
writer.writerows("\n".join(rows))
The above is not very robust as it has no error handling nor does it close any open files.
This should work whether or not the the individual files have one or more rows of CSV data in them. Also I did not run this code, but it should give you an idea of what to do.
I modified what #wisty said to be worked with python 3.x, for those of you that have encoding problem, also I use os module to avoid of hard coding
import os
def merge_all():
dir = os.chdir('C:\python\data\\')
fout = open("merged_files.csv", "ab")
# first file:
for line in open("file_1.csv",'rb'):
fout.write(line)
# now the rest:
list = os.listdir(dir)
number_files = len(list)
for num in range(2, number_files):
f = open("file_" + str(num) + ".csv", 'rb')
f.__next__() # skip the header
for line in f:
fout.write(line)
f.close() # not really needed
fout.close()
Here is a script:
Concatenating csv files named SH1.csv to SH200.csv
Keeping the headers
import glob
import re
# Looking for filenames like 'SH1.csv' ... 'SH200.csv'
pattern = re.compile("^SH([1-9]|[1-9][0-9]|1[0-9][0-9]|200).csv$")
file_parts = [name for name in glob.glob('*.csv') if pattern.match(name)]
with open("file_merged.csv","wb") as file_merged:
for (i, name) in enumerate(file_parts):
with open(name, "rb") as file_part:
if i != 0:
next(file_part) # skip headers if not first file
file_merged.write(file_part.read())
Updating wisty's answer for python3
fout=open("out.csv","a")
# first file:
for line in open("sh1.csv"):
fout.write(line)
# now the rest:
for num in range(2,201):
f = open("sh"+str(num)+".csv")
next(f) # skip the header
for line in f:
fout.write(line)
f.close() # not really needed
fout.close()
Let's say you have 2 csv files like these:
csv1.csv:
id,name
1,Armin
2,Sven
csv2.csv:
id,place,year
1,Reykjavik,2017
2,Amsterdam,2018
3,Berlin,2019
and you want the result to be like this csv3.csv:
id,name,place,year
1,Armin,Reykjavik,2017
2,Sven,Amsterdam,2018
3,,Berlin,2019
Then you can use the following snippet to do that:
import csv
import pandas as pd
# the file names
f1 = "csv1.csv"
f2 = "csv2.csv"
out_f = "csv3.csv"
# read the files
df1 = pd.read_csv(f1)
df2 = pd.read_csv(f2)
# get the keys
keys1 = list(df1)
keys2 = list(df2)
# merge both files
for idx, row in df2.iterrows():
data = df1[df1['id'] == row['id']]
# if row with such id does not exist, add the whole row
if data.empty:
next_idx = len(df1)
for key in keys2:
df1.at[next_idx, key] = df2.at[idx, key]
# if row with such id exists, add only the missing keys with their values
else:
i = int(data.index[0])
for key in keys2:
if key not in keys1:
df1.at[i, key] = df2.at[idx, key]
# save the merged files
df1.to_csv(out_f, index=False, encoding='utf-8', quotechar="", quoting=csv.QUOTE_NONE)
With the help of a loop you can achieve the same result for multiple files as it is in your case (200 csv files).
If the files aren't numbered in order, take the hassle-free approach below:
Python 3.6 on windows machine:
import pandas as pd
from glob import glob
interesting_files = glob("C:/temp/*.csv") # it grabs all the csv files from the directory you mention here
df_list = []
for filename in sorted(interesting_files):
df_list.append(pd.read_csv(filename))
full_df = pd.concat(df_list)
# save the final file in same/different directory:
full_df.to_csv("C:/temp/merged_pandas.csv", index=False)
An easy-to-use function:
def csv_merge(destination_path, *source_paths):
'''
Merges all csv files on source_paths to destination_path.
:param destination_path: Path of a single csv file, doesn't need to exist
:param source_paths: Paths of csv files to be merged into, needs to exist
:return: None
'''
with open(destination_path,"a") as dest_file:
with open(source_paths[0]) as src_file:
for src_line in src_file.read():
dest_file.write(src_line)
source_paths.pop(0)
for i in range(len(source_paths)):
with open(source_paths[i]) as src_file:
src_file.next()
for src_line in src_file:
dest_file.write(src_line)
import pandas as pd
import os
df = pd.read_csv("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\Sales_April_2019.csv")
files = [file for file in os.listdir("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data")
for file in files:
print(file)
all_data = pd.DataFrame()
for file in files:
df=pd.read_csv("e:\\data science\\kaggle assign\\monthly sales\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\"+file)
all_data = pd.concat([all_data,df])
all_data.head()
I have done it by implementing a function that expect output file and paths of the input files.
The function copy the file content of the first file into the output file and then does the same for the rest of input files but without the header line.
def concat_files_with_header(output_file, *paths):
for i, path in enumerate(paths):
with open(path) as input_file:
if i > 0:
next(input_file) # Skip header
output_file.writelines(input_file)
Usage example of the function:
if __name__ == "__main__":
paths = [f"sh{i}.csv" for i in range(1, 201)]
with open("output.csv", "w") as output_file:
concat_files_with_header(output_file, *paths)

Categories

Resources