Each folder has a csv for each month of the year(1.csv,2.csv,3.csv etc) and the script creates a dataframe combining the 9th column for all 12 csv's into an xlsx sheet named concentrated.xlsx. It works but only for one directory at a time
files = glob['2014/*.csv']
sorted_files = natsorted(files)
def read_9th(fn):
return pd.read_csv(fn, usecols=[9], names=headers)
big_df = pd.concat([read_9th(fn) for fn in sorted_files], axis=1)
writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')
big_df.to_excel(writer,'2014')
writer.save()
Is it possible to create a dataframe automatically for each directory without having to manually create one for each folder like this:
files14 = glob['2014/*.csv']
files15 = glob['2015/*.csv']
sorted_files14 = natsorted(files14)
sorted_files15 = natsorted(files15)
def read_9th(fn):
return pd.read_csv(fn, usecols=[9], names=headers)
big_df = pd.concat([read_9th(fn) for fn in sorted_files14], axis=1)
big_df1 = pd.concat([read_9th(fn) for fn in sorted_files15], axis=1)
writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')
big_df.to_excel(writer,'2014')
big_df1.to_excel(writer,'2015')
writer.save()
If you get a list of the folders that want to process, e.g.
folders = os.listdir('.')
# or
folders = ['2014', '2015', '2016']
You could do something like:
writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')
for folder in folders:
files = glob('%s/*.csv' % folder)
sorted_files = natsorted(files)
big_df = pd.concat([read_9th(fn) for fn in sorted_files], axis=1)
big_df.to_excel(writer, folder)
writer.save()
Related
I am trying to merge differents csv in Python. The files are in the same folder. All files have one column in common 'client_ID'. I tried this code:
path= r'/folder_path/'
allfiles = glob.glob(path + "/*.csv")
df = pd.DataFrame()
for file in allfiles:
df_file = pd.read_csv(file)
df_file = pd.merge(df, df_file, on='partner_id')
df
You can read the first csv file first so that you don't start with an empty dataframe. I would edit your code like this:
path= r'/folder_path/'
allfiles = glob.glob(path + "/*.csv")
for i, file in enumerate(allfiles):
if i < 1:
df = pd.read_csv(file)
else:
df_file = pd.read_csv(file)
df = pd.merge(df, df_file, on='partner_id')
df
I am trying to parse a list of .txt files within a zip folder but it's only parsing one file from that list
Code:
def custom_parse(self, response):
self.logger.info(response.url)
links = response.xpath("//a[contains(#href, '.zip')]/#href").getall()
for link in list(set(links)):
print(link)
local_path = self.download_file("https://www.sec.gov" + link)
zip_file = zipfile.ZipFile(local_path)
zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
zip_csv_file = zip_csv_files[0]
with zip_file.open(zip_csv_file, "r") as zip:
# df = pd.read_csv(BytesIO(zip.read()), dtype=object)
df = pd.read_csv(zip, dtype=object, header=None, sep='delimiter')
df = self.standardized(df)
for k, row in df.iterrows():
yield dict(row)
def standardized(self, df):
# df.columns = [col.lower().strip().replace(" ", "_") for col in df.columns]
df = df.fillna('')
return df
I am going to assume it's due to zip_csv_file = zip_csv_files[0] but I am unsure how I can modify my current code to parse all the .txt files in a given zip folder.
You already pull out all the .txt files with your list comprehension, so just read those in a loop and concatenate them. This is untested, but should be close
replace the appropriate section of your code with this:
UPDATE:
zip_file = zipfile.ZipFile(local_path)
text_files = zip_file.infolist()
df_list =[]
for file_name in text_files:
if file_name.filename.endswith(".txt") and "pre" not in file_name.filename:
df_list.append(pd.read_csv(zip_file(open(file_name.filename)), dtype=object, header=None, sep='delimiter'))
df = pd.concat(df_list)
df = self.standardized(df)
I am trying to create 3 different dataframes to output in my excel file in 3 separate worksheet called df, df_OK, df_KO. However the code below only outputs df and is not creating the other 2 dataframes df_OK, df_KO to have in the same Excel file but in 2 separate worksheets.
Any suggestions? Thanks
class blah:
def __init__(self, path, file_in, file_out):
self.path = path
self.file_in = file_in
self.file_out = file_out
def process_file(self):
df = pd.read_excel(self.path + self.file_in)
df_OK = df.loc[df['Status'] == 'OK']
df_KO = df.loc[df['Status'] == 'KO']
df_OK.loc['Total'] = df_OK[['Price']].sum(axis=0)
writer = pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter')
dfs = {
'All': df,
'OK': df_OK,
'KO': df_KO
}
for sheet_name in dfs.keys():
dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
b = blah('C:/Users/......./',
'path...',
'file_in....',
'file_out...')
b.process_file()
It is because you overwrite the same Excel file in every iteration of your for sheet_name in dfs.keys() loop. So every time you write an Excel file with only a single sheet to the same filename, thus overwriting the previous document.
You should move the writer.save() outside your loop like so:
for sheet_name in dfs.keys():
dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
I am using Ubuntu 16.0.4. After reading from an excel file, I am trying to add multiple excel sheet to a pdf file.
df = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df = df.dropna(axis = 1, how='all')
df = df.dropna(how='all')
df.to_html("file.html")
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
How can I add another excel sheet from the same excel file to the same pdf file without overwriting the previous sheet that is in the pdf?
Thanks!
If the two sheets have the same data structure (columns and etc.):
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
df = df1.append(df2)
If not:
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
# Do whatever you need to transform the dfs
html_str = '<br />'.join([df1.to_html(), df2.to_html()])
with open("file.html", "w") as text_file:
text_file.write(html_str)
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
Say I have a folder folder1 with excel files, their filenames share same structures: city, building name and id, I want save them in dataframe and then excel file. Please note I also need to append other folders' excel filenames in result.
bj-LG center-101012.xlsx
sh-ABC tower-1010686.xlsx
bj-Jinzhou tower-101018.xlsx
gz-Zijin building-101012.xls
...
The first method I have tried:
import os
import pandas as pd
from pandas import DataFrame, ExcelWriter
path = os.getcwd()
file = [".".join(f.split(".")[:-1]) for f in os.listdir() if os.path.isfile(f)] #exclude files' extension
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#print(city)
df = pd.DataFrame(columns = ['city', 'building name', 'id'])
df['city'] = city
df['building name'] = projectName
df['id'] = projectID
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
Problem:
Traceback (most recent call last):
File "<ipython-input-203-c09878296e72>", line 9, in <module>
city = file.split('-')[0]
AttributeError: 'list' object has no attribute 'split'
My second method:
for root, directories, files in os.walk(path):
#print(root)
for file in files:
if file.endswith('.xlsx') or file.endswith('.xls'):
#print(file)
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#print(city)
df = pd.DataFrame(columns = ['city', 'building name', 'id'])
df['city'] = city
df['building name'] = projectName
df['id'] = projectID
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
I got an empty test.xlsx file, how could I make it works? Thanks.
This splits off the file extension, then unpacks the split into the vairables.
Creates a dictionary then appends the dictionary to the dataframe.
files = [
"bj-LG center-101012.xlsx",
"sh-ABC tower-1010686.xlsx",
"bj-Jinzhou tower-101018.xlsx",
"gz-Zijin building-101012.xls"]
df = pd.DataFrame()
for file in files:
filename = file.split(".")[0]
city, projectName, projectID = filename.split("-")
d = {'city':city,'projectID':projectID,'projectName':projectName}
df = df.append(d,ignore_index=True)
df.to_excel('summary.xlsx')
Method 2 is close.
You need to create the dataframe before the for loops. After your variable assignments, make a dictionary of the variables and append it to the dataframe.
There is also probably a better way to find your file list using glob, but i will just work with what you have already done.
df = pd.DataFrame()
for root, directories, files in os.walk(path):
for file in files:
if file.endswith('.xlsx') or file.endswith('.xls'):
#print(file)
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#append data inside inner loop
d = {'city':city, 'building name':projectname, 'id':projectID}
df.append(d)
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
This should works, thanks to the hint of use glob from #Dan Wisner
import os
from glob import glob
fileNames = [os.path.splitext(val)[0] for val in glob('*.xlsx') or glob('*.xls')]
df = pd.DataFrame({'fileNames': fileNames})
df[['city', 'name', 'id']] = df['fileNames'].str.split('-', n=2, expand=True)
del df['fileNames']
writer = pd.ExcelWriter("C:/Users/User/Desktop/test1.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()