Write to multiple csv files using glob - python

I am working with lots of csv files and need to add a column to each. I tried glob, for example:
import glob
filenames = sorted(glob.glob('./DATA1/*2018*.csv'))
filenames = filenames[0:10]
import numpy as np
import pandas as pd

for f in filenames:
    df = pd.read_csv(f, header=None, index_col=None)
    df.columns = ['Date','Signal','Data','Code']
    # this is what I should add to all csv files
    df["ID"] = df["Data"].str.slice(0,2)
and I need a way to save each file back to csv (not concatenated) under a different name, such as "file01edited.csv", after adding the column.

Use to_csv with an f-string to change the file names:
for f in filenames:
    df = pd.read_csv(f, names=['Date','Signal','Data','Code'], index_col=None)
    # this is the column added to every csv file
    df["ID"] = df["Data"].str.slice(0,2)
    # python 3.6+
    df.to_csv(f'{f[:-4]}edited.csv', index=False)
    # python below 3.6
    # df.to_csv('{}edited.csv'.format(f[:-4]), index=False)
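Slicing with f[:-4] assumes every name ends in ".csv"; a pathlib variant (a minimal sketch, with the "edited" suffix simply mirroring the example above) builds the output name from the file's stem instead:
from pathlib import Path
import pandas as pd

for f in filenames:
    df = pd.read_csv(f, names=['Date','Signal','Data','Code'], index_col=None)
    df["ID"] = df["Data"].str.slice(0,2)
    path = Path(f)
    # './DATA1/file01.csv' -> './DATA1/file01edited.csv'
    df.to_csv(path.with_name(f'{path.stem}edited.csv'), index=False)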

Related

Exporting Pandas output for multiple CSV files

I have many CSV files under subdirectories in one folder. They all contain tweets and other metadata. I am interested in removing most of this metadata and keeping only the tweets themselves and their time. I used glob to read the files, and the removal part seems to be working fine. However, I am not sure how to save the output so that all files are saved with their original file names.
import pandas as pd
import glob

path = r'D:\tweets'
myfiles = glob.glob(r'D:\tweets\**\*.csv', recursive=True)
for f in myfiles:
    df = pd.read_csv(f)
    df = df.drop(["name", "id", "conversation_id", "created_at", "date"], axis=1)
    df = df[df["language"].str.contains("bn|ca|ckbu|id|zh") == False]
    df.to_csv("output_filename.csv", index=False, encoding='utf8')
If you write back to f instead, each original file is overwritten in place, keeping its original name:
for f in myfiles:
    df = pd.read_csv(f)
    df = df.drop(["name", "id", "conversation_id", "created_at", "date"], axis=1)
    df = df[df["language"].str.contains("bn|ca|ckbu|id|zh") == False]
    df.to_csv(f, index=False, encoding='utf8')
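If you would rather keep the originals untouched, one option (a sketch; the output folder name is an assumption) is to write into a separate directory while reusing each original base name:
import os
import pandas as pd

out_dir = r'D:\tweets_clean'  # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)
for f in myfiles:
    df = pd.read_csv(f)
    df = df.drop(["name", "id", "conversation_id", "created_at", "date"], axis=1)
    df = df[df["language"].str.contains("bn|ca|ckbu|id|zh") == False]
    # files from different subdirectories that share a base name would collide here
    df.to_csv(os.path.join(out_dir, os.path.basename(f)), index=False, encoding='utf8')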

Merge multiple csv files into one and hide the header

I'm trying to merge multiple csv files into one big file.
The script is working, but I would like to keep only the first header, not one for each csv inside the big file.
How can I do that? Shouldn't it work with header=None?
import os
import glob
import pandas

def concatenate(inDir=r'myPath', outFile=r"outPath"):
    os.chdir(inDir)
    fileList = glob.glob("*.csv")  # generate a list of csv files using glob
    dfList = []
    for filename in fileList:
        print(filename)
        df = pandas.read_csv(filename, header=None)
        dfList.append(df)
    concatDf = pandas.concat(dfList, axis=0)
    concatDf.to_csv(outFile, index=None)  # export the dataframe to a csv file
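One way to keep only the first header (a sketch, assuming every csv shares the same header row): parse each file's first row as the header with header=0 instead of header=None, so the repeated headers never enter the data, and a single header is written on export:
import os
import glob
import pandas

def concatenate(inDir=r'myPath', outFile=r"outPath"):
    os.chdir(inDir)
    fileList = glob.glob("*.csv")
    # header=0 consumes each file's header row during parsing
    dfList = [pandas.read_csv(filename, header=0) for filename in fileList]
    concatDf = pandas.concat(dfList, axis=0, ignore_index=True)
    concatDf.to_csv(outFile, index=False)  # writes one header for the whole file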

Adding dataframe column names based on filename after merging using Glob

I have Excel files in a folder, all in the same format with data for all countries in the world in the sheet 'Dataset2' in each file.
I have merged all files together into one using glob, but I need to know which file (i.e. which country) each column comes from.
Is there a way to do this?
import glob
import os
import pandas as pd
os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined = pd.concat([pd.read_excel(f, sheet_name='Dataset2') for f in all_filenames], axis=1, ignore_index=True)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
You could unpack the list comprehension into a for-loop and add an additional column to each data file, something like this:
import glob
import os
import pandas as pd

os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
file_list = []
for f in all_filenames:
    data = pd.read_excel(f, sheet_name='Dataset2')
    data['source_file'] = f  # create a column with the name of the file
    file_list.append(data)
# no ignore_index here: renumbering the columns would discard the
# source_file label that identifies each file
combined = pd.concat(file_list, axis=1)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
If you're using the os module, try os.path.basename and pass the names to the keys argument in concat:
import glob
import os
import pandas as pd
os.chdir(r"C:\Users\Umar.Hussain\OneDrive - Ricoh Europe PLC\Documents\Excels")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
names = [os.path.basename(f) for f in all_filenames]
combined = pd.concat([pd.read_excel(f, sheet_name='Sheet1') for f in all_filenames], keys=names, axis=1)
As you're using axis=1, this will add the keys to the header, so you may want to read the Excel files first and append each to a list, like:
dfs = []
for file in all_filenames:
    df = pd.read_excel(file)
    df['source'] = os.path.basename(file)
    dfs.append(df)
combined = pd.concat(dfs, ignore_index=True)  # stack row-wise; each row keeps its source label

Reading and Passing Excel Filename with Pandas

I want to read Excel files with Pandas, delete the header row and the first column, and write the resulting data to an Excel file with the same name. I want to do this for all the Excel files in a folder. I have written the code for reading and writing the data, but I am having trouble saving the data in a file with the same name. The code I have written is like this:
import numpy as np
import pandas as pd
import os

for filename in os.listdir('./'):
    if filename.endswith('.xlsx'):
        df = pd.read_excel('new.xlsx', skiprows=1)
        df.drop(df.columns[0], axis=1, inplace=True)
        df.to_csv('new.csv', index=False)
How can I automate my code for all the excel files in the same folder?
Use the variable filename in read_excel, create the new file names with format, and remove the first column with DataFrame.iloc, selecting all columns except the first:
for filename in os.listdir('./'):
    if filename.endswith('.xlsx'):
        df = pd.read_excel(filename, skiprows=1)
        df.iloc[:, 1:].to_csv('new_{}.csv'.format(filename), index=False)
Another solution with glob, where it is possible to specify the extension:
import glob

for filename in glob.glob('./*.xlsx'):
    df = pd.read_excel(filename, skiprows=1)
    df.iloc[:, 1:].to_csv('new_{}.csv'.format(filename), index=False)
    # python 3.6+
    # df.iloc[:, 1:].to_csv(f'new_{filename}.csv', index=False)
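Note that filename from glob still carries the './' prefix and the '.xlsx' extension, so the output is named like 'new_./data.xlsx.csv'. A pathlib variant (a sketch) gives cleaner names:
import glob
from pathlib import Path
import pandas as pd

for filename in glob.glob('./*.xlsx'):
    df = pd.read_excel(filename, skiprows=1)
    # Path('./data.xlsx').stem == 'data', so this writes 'new_data.csv'
    df.iloc[:, 1:].to_csv(f'new_{Path(filename).stem}.csv', index=False)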
Try the below for reading multiple files:
import pandas as pd
import glob

# Read multiple files into one dataframe with pandas concat.
# If you have a path defined like `/home/data/`, you can use `/home/data/*.xlsx`;
# otherwise mention the path directly.
df = pd.concat([pd.read_excel(files, skiprows=1) for files in glob.glob("/home/data/*.xlsx")])
Alternative, reading multiple files into one dataframe:
all_Files = glob.glob('/home/data/*.xlsx')
df = pd.concat((pd.read_excel(files, skiprows=1) for files in all_Files))
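Note that both variants merge everything into a single dataframe; to also write each file back out under its own name, combine this kind of read with the per-file loop shown earlier.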

Reading multiple csv files and writing it to another csv file

I have this code
import pandas as p
import csv
df = p.read_csv('interview1.csv')
df2 = df[['Participant', 'Translation']] # selects two of the columns in your file
df2.to_csv('out.csv')
How do I read multiple files and then write them to 'out.csv'? So basically, instead of reading only interview1, I read interview2 through interview7 as well into out.csv.
Simply open the output file in append mode:
import pandas as p

csv_list = ['interview1.csv', 'interview2.csv', ...]
for itw in csv_list:
    df = p.read_csv(itw)
    df.to_csv('out.csv', mode='a')
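Appending this way writes each file's header (and index) into out.csv every time through the loop; one way around that (a sketch) is to emit the header only for the first file:
import pandas as p

csv_list = ['interview1.csv', 'interview2.csv', ...]
for i, itw in enumerate(csv_list):
    df = p.read_csv(itw)
    # header only once, for the first file, and no index column
    df.to_csv('out.csv', mode='a', header=(i == 0), index=False)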
Use this to read all the .csv data from a folder and combine it together:
import pandas as pd
import glob
import os
path = r'file path'
all_files = glob.glob(os.path.join(path, "*.csv"))
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv("combined-data_new.csv")
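One detail: without index=False, to_csv also writes the dataframe's integer index as an unnamed first column in combined-data_new.csv.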
