combining multiple sheets on excel - python

Hi, I am trying to combine all the Excel sheets from one folder and I always get the error below: 'TypeError: listdir: path should be string, bytes, os.PathLike or None, not list'. Please help.
# Question code (broken): tries to combine every Excel file in one folder.
import pandas as pd
import os

# BUG: wrapping the folder name in [ and ] makes `path` a one-element list,
# which is exactly what triggers "TypeError: listdir: path should be string,
# bytes, os.PathLike or None, not list" below.
path = ['pythonProject']
combine = pd.DataFrame()
#2nd
for j in os.listdir(path):  # os.listdir() rejects a list argument -> TypeError
    # NOTE(review): even with a valid path this reads `path` (the folder),
    # not the individual file `j` -- presumably os.path.join(path, j) was meant.
    df = pd.read_excel(path, skiprows=3)
    combine = combine.append(df, ignore_index=True)
print(combine)

You accidentally made `path` a list because of the [ and ]. Try the following code:
import pandas as pd
import os

# Folder that holds the Excel workbooks to combine.
path = 'pythonProject'

# Collect one DataFrame per workbook, then concatenate once at the end.
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to combine frames.)
frames = []
for name in os.listdir(path):
    # BUG FIX: read the individual file, not the folder -- join the
    # directory with the file name returned by os.listdir().
    # skiprows=3 skips the banner rows at the top of each sheet.
    frames.append(pd.read_excel(os.path.join(path, name), skiprows=3))

# Guard the empty-folder case so pd.concat does not raise on an empty list.
combine = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(combine)

Related

Concatenate multiple csv files from different folders into one csv file in python

I am trying to concatenate multiple csv files into one file(about 30 files). All csv files are located in different folders.
However, I have encountered an error while appending all files together: OSError: Initializing from file failed
Here is my code:
# Question code (broken): concatenate the newest CSV from several folders.
import pandas
# NOTE(review): pandas is imported as `pandas`, yet `pd.read_csv`/`pd.concat`
# are used below -- the snippet as shown would raise NameError on `pd`.
import glob

# NOTE(review): `os` is used below (os.path.getmtime) but never imported.
path = 'xxx'
target_folders=['Apples', 'Oranges', 'Bananas','Raspberry','Strawberry', 'Blackberry','Gooseberry','Liche']
output ='yyy'

path_list = []
for idx in target_folders:
    # NOTE(review): no separator between `path` and `idx`; unless `path`
    # ends with a slash this pattern is wrong (os.path.join would be safer).
    lst_of_files = glob.glob(path + idx +'\\*.csv')
    # max(..., key=os.path.getmtime) picks the most recently modified file.
    latest_files = max(lst_of_files, key=os.path.getmtime)
    path_list.append(latest_files)

df_list = []
for file in path_list:
    df = pd.read_csv(file)
    df_list.append(df)
# NOTE(review): appending a generator of frames onto the last frame read is
# not meaningful; pd.concat(df_list) is what was intended here.
final_df = df.append(df for df in df_list)
# BUG: `latest_files` is a single path *string*, so this iterates over its
# characters, calling read_csv on one-character "paths" -- the source of
# "OSError: Initializing from file failed".  `path_list` was intended.
combined_csv = pd.concat([pd.read_csv(f) for f in latest_files])
combined_csv.to_csv(output + "combined_csv.csv", index=False)
OSError Traceback (most recent call last)
<ipython-input-126-677d09511b64> in <module>
1 df_list = []
2 for file in latest_files:
----> 3 df = pd.read_csv(file)
4 df_list.append(df)
5 final_df = df.append(df for df in df_list)
OSError: Initializing from file failed
This solution should work like a charm for you:
import pandas as pd
import pathlib

# Root of the CSV tree and the directory the combined file is written to.
data_dir = '/Users/thomasbryan/projetos/blocklist/files/'
out_dir = '.'

# Gather every CSV beneath data_dir, recursing into subfolders.
list_files = [csv_path for csv_path in pathlib.Path(data_dir).glob('**/*.csv')]

# Read each file lazily and stack all of them into a single frame.
df = pd.concat((pd.read_csv(csv_path) for csv_path in list_files),
               ignore_index=True)
df.to_csv(pathlib.Path(out_dir) / 'combined_csv.csv', index=False)
Try to simplify your code:
import pandas as pd
import pathlib

# Source tree of CSVs and the output directory.
data_dir = 'xxx'
out_dir = 'yyy'

# Read every CSV under data_dir (recursively) into a list of frames.
data = []
for filename in pathlib.Path(data_dir).glob('**/*.csv'):
    df = pd.read_csv(filename)
    data.append(df)

# BUG FIX: concatenate the *list* of frames (`data`), not `df`, which is
# just the last frame read (pd.concat over a single DataFrame iterates its
# columns and fails).
df = pd.concat(data, ignore_index=True)
# BUG FIX: use the out_dir variable, not the literal string 'out_dir'
# (the original wrote into a directory literally named "out_dir").
df.to_csv(pathlib.Path(out_dir) / 'combined_csv.csv', index=False)
Without seeing your CSV file it's hard to be sure, but I've come across this problem before with unusually formatted CSVs. The CSV parser may be having difficulty in determine the structure of the CSV files, separators etc.
Try df = pd.read_csv(file, engine = 'python')
From the docs: "The C engine is faster while the python engine is currently more feature-complete."
Try passing the engine = 'python' argument on reading a single CSV file and see if you get a successful read. That way you can narrow down the problem to either file reads or traversing the files.

Pandas generating an empty csv while trying combine all csv's into one csv

I am writing a python script that will read all the csv files in the current location and merge them into a single csv file. Below is my code:-
import os
import numpy as np
import pandas as pd
import glob

path = os.getcwd()
# BUG FIX: the extension must be a string literal; a bare `csv` is an
# undefined name and raises NameError before anything runs.
extension = 'csv'
os.chdir(path)
# All CSV files in the current directory.
tables = glob.glob('*.{}'.format(extension))

data = pd.DataFrame()
for i in tables:
    try:
        df = pd.read_csv(r'' + path + '/' + i + '')
        # Create an index column named after the file and leave it empty.
        # NOTE(review): an all-NaN index plus `header=None` below is what
        # produces the seemingly empty output described in the question.
        df[i] = np.NaN
        df.set_index(i, inplace=True)
        # Append an empty row for easy differentiation between files.
        df.loc[df.iloc[-1].name + 1, :] = np.NaN
        data = data.append(df)
    except Exception as e:
        print(e)

# BUG FIX: the keyword is `index`, not `indexx` (TypeError at runtime).
data.to_csv('final_output.csv', index=False, header=None)
If I remove the below lines of code then it works:-
df[i] = np.NaN
df.set_index(i, inplace=True)
But I want to have the first column name as the name of the file and its values NaN or empty.
I want the output to look something like this:-
I tend to avoid the .append method in favor of pandas.concat
Try this:
import os
from pathlib import Path
import pandas as pd

# Every CSV sitting in the current working directory.
files = Path(os.getcwd()).glob('*.csv')

# Read each file, tag its rows with the source file's name, and stack
# everything into one frame with a fresh integer index.
frames = []
for csv_path in files:
    frame = pd.read_csv(csv_path)
    frame = frame.assign(filename=csv_path.name)
    frames.append(frame)
df = pd.concat(frames, ignore_index=True)

df.to_csv('alldata.csv', index=False)

Adding dataframe column names based on filename after merging using Glob

I have Excel files in a folder, all in the same format with data for all countries in the world in the sheet 'Dataset2' in each file.
I have merged all files together into one using glob, but I need to know which file (i.e. which country) each column comes from.
Is there a way to do this?
import glob
import os
import pandas as pd

# Work from the folder that holds one workbook per country.
os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# Concatenate the 'Dataset2' sheet of every workbook side by side.
# NOTE(review): ignore_index=True together with axis=1 renumbers the columns,
# so any trace of which file each column came from is lost -- which is the
# problem the question is asking about.
combined = pd.concat([pd.read_excel(f, sheet_name='Dataset2') for f in all_filenames ],axis=1, ignore_index=True)
combined.to_excel( "New/combined.xlsx", index=False, encoding='utf-8-sig')
You could unpack the list comprehension into a for-loop and add an additional column to each data file, something like this:
import glob
import os
import pandas as pd

# Work from the folder that holds one workbook per country.
os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# Read each workbook's 'Dataset2' sheet and record which file it came from.
file_list = []
for f in all_filenames:
    data = pd.read_excel(f, sheet_name='Dataset2')
    data['source_file'] = f  # column naming the originating file
    file_list.append(data)

# BUG FIX: the original kept ignore_index=True with axis=1, which renumbers
# the columns and throws away every column name -- including the
# 'source_file' column just added, defeating the whole point of the answer.
# Omitting it preserves the column names.
combined = pd.concat(file_list, axis=1)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
if you're using os module try path.basename and adding this to the key argument in concat:
import glob
import os
import pandas as pd

# Folder containing the Excel workbooks to merge.
os.chdir(r"C:\Users\Umar.Hussain\OneDrive - Ricoh Europe PLC\Documents\Excels")
extension = 'xlsx'

# Collect the workbook paths, then derive a base name for each to use as
# its concat key (so the combined header identifies the source workbook).
all_filenames = []
for candidate in glob.glob('*.{}'.format(extension)):
    all_filenames.append(candidate)
names = []
for workbook in all_filenames:
    names.append(os.path.basename(workbook))

# Read the 'Sheet1' tab of every workbook and concatenate them side by
# side, keyed by file name.
sheets = []
for workbook in all_filenames:
    sheets.append(pd.read_excel(workbook, sheet_name='Sheet1'))
combined = pd.concat(sheets, keys=names, axis=1)
As you're using axis=1, this will add the keys to the header, so you may want to read the Excel files first and add them to a list, like:
# Alternative: tag each frame with its source file in a 'source' column
# instead of using concat keys (better suited to stacking with axis=0).
# NOTE(review): fragment -- relies on `all_filenames` defined earlier and
# has no concat step of its own.
dfs = []
for file in all_filenames:
    df = pd.read_excel(file)
    df['source'] = os.path.basename(file)
    dfs.append(df)

How to concat multiple spreadsheets in Excel workbooks into pandas dataframe?

I have multiple folders and subfolders, containing Excel workbooks with multiple tabs. How do I concat all the information into 1 pandas dataframe?
Here is my code so far:
from pathlib import Path
import os
import pandas as pd
import glob

p = Path(r'C:\Users\user1\Downloads\key_folder')
# Recursively find every workbook under the key folder.
globbed_files = p.glob('**/**/*.xlsx')

df = []
for file in globbed_files:
    # BUG: sheet_name=None makes read_excel return a dict mapping sheet
    # name -> DataFrame, not a DataFrame -- which is why pd.concat below
    # fails with "cannot concatenate object of type OrderedDict".
    # (ignore_index is also not a read_excel parameter.)
    frame = pd.read_excel(file, sheet_name = None, ignore_index=True)
    frame['File Path'] = os.path.basename(file)
    df.append(frame)
# df = pd.concat([d.values() for d in df], axis = 0, ignore_index=True)
df = pd.concat(df, axis=0, ignore_index = True)
This is generating the following error:
cannot concatenate object of type "<class 'collections.OrderedDict'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid
When I ran pd.DataFrame(df), I saw that each Excel spreadsheet tab is a separate column. The cells contain the data and headers in text form, forming a really long string.
Any help is appreciated! Thank you!
Here is the final code:
from pathlib import Path
import pandas as pd

p = Path('path here')
# Recursively collect every workbook beneath the key folder.
globbed_files = p.glob('**/**/*.xlsx')

# One DataFrame per sheet of every workbook, tagged with its sheet name.
list_dfs = []
for file in globbed_files:
    # FIX: pd.ExcelFile exposes the sheet names without the extra xlrd
    # dependency -- xlrd >= 2.0 cannot open .xlsx files at all, so the
    # original xlrd.open_workbook call fails on modern installs.
    workbook = pd.ExcelFile(file)
    for sheet_name in workbook.sheet_names:
        df = pd.read_excel(workbook, sheet_name)
        df['Sheet Name'] = sheet_name
        list_dfs.append(df)

dfs = pd.concat(list_dfs, axis=0)
dfs.to_excel('merged spreadsheet.xlsx')

Changing Column Heading CSV File

I am currently trying to change the headings of the file I am creating. The code I am using is as follows;
import pandas as pd
import os, sys
import glob

# Glob pattern for the folder of CSVs to summarise.
path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"

# One value_counts Series per file; every Series is named 'logid', which
# is why each output column carries that same heading -- the problem the
# question is asking about.
list_=[]
for fname in glob.glob(path):
    df = pd.read_csv(fname, dtype=None, low_memory=False)
    output = (df['logid'].value_counts())
    list_.append(output)

df1 = pd.DataFrame()  # NOTE(review): unused
df2 = pd.concat(list_, axis=1)
df2.to_csv('final.csv')
Basically, I am looping through a file directory and extracting data from each file. Using this outputs the following image:
http://imgur.com/a/LE7OS
All I want to do is change the column names from 'logid' to the name of the file currently being processed, but I am not sure how to do this. Any help is great! Thanks.
Instead of appending the values try to append values by creating the dataframe and setting the column i.e
# Wrap the counts in a DataFrame so its single column can be renamed to
# the source file's base name without the extension.
# NOTE(review): uses df['value'] where the question used df['logid'] --
# presumably the same column was meant; verify against the question.
output = pd.DataFrame(df['value'].value_counts())
output.columns = [os.path.basename(fname).split('.')[0]]
list_.append(output)
Changes in the code in the question
import pandas as pd
import os, sys
import glob

# Glob pattern for the folder of CSVs to summarise.
path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"

list_ = []
# BUG FIX: `files` was undefined (NameError) -- iterate the glob of the
# path pattern, exactly as the question's code does.
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    # CONSISTENCY FIX: the question counts df['logid'], not df['value'].
    # Name each counts column after its file (base name, no extension).
    output = pd.DataFrame(df['logid'].value_counts())
    output.columns = [os.path.basename(fname).split('.')[0]]
    list_.append(output)

df2 = pd.concat(list_, axis=1)
df2.to_csv('final.csv')
Hope it helps

Categories

Resources