Read multiple files in Python and generate one output - python

I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at one time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converts the input files into the upload format.
Here's what I have done so far (1 input file -> 1 output file):
import pandas as pd

def main():
    initial_workbook = 'C:/files/1111.xlsx'
    initial_df = pd.read_excel(initial_workbook, sheet_name="default")
    # use the third row as the header and drop the rows above it
    new_header = initial_df.iloc[2]
    initial_df = initial_df.iloc[3:]
    initial_df.columns = new_header
    # drop all rows with no data
    indexNames = initial_df[initial_df['grade'] == 'select'].index
    initial_df.drop(indexNames, inplace=True)
    initial_df = initial_df.dropna(axis=1, how='all')
    initial_df.to_excel('C:/files/upload_file.xlsx', index=False)
Is there a way to generate one upload file for all the files in the input folder? And once the input files have been processed, rename them by prefixing an x, e.g. x1111.xlsx.

So here is how I would approach it, for a given batch:
from datetime import datetime
import os
from pathlib import Path

import pandas as pd

def main(workbook_path):
    initial_df = pd.read_excel(workbook_path, sheet_name="default")
    # use the third row as the header and drop the rows above it
    new_header = initial_df.iloc[2]
    initial_df = initial_df.iloc[3:]
    initial_df.columns = new_header
    # drop all rows with no data
    indexNames = initial_df[initial_df['grade'] == 'select'].index
    initial_df.drop(indexNames, inplace=True)
    initial_df.dropna(axis=1, how='all', inplace=True)
    return initial_df

all_dfs = []
proj_path = Path("C:/files/")
for f in os.listdir(proj_path):
    # skip previously generated batch files
    if f.endswith(".xlsx") and not f.endswith("_batch.xlsx"):
        print(f"processing {f}...")
        df_tmp = main(proj_path / f)  # pass the full path; main() must not re-join it
        df_tmp["file_name"] = f
        all_dfs.append(df_tmp)
df_all = pd.concat(all_dfs, axis=0)
# format the timestamp explicitly: str(datetime.now()) contains ':', which is invalid in Windows file names
df_all.to_excel(proj_path / f"{datetime.now():%Y-%m-%d_%H%M%S}_batch.xlsx", index=False)
You can potentially enclose the logic for a batch in a function.
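For the renaming part of the question, here is a minimal sketch (assuming the files stay in the same folder and that a leading x marks a file as already processed):

from pathlib import Path

proj_path = Path("C:/files/")
for f in proj_path.glob("*.xlsx"):
    if f.name.startswith("x"):
        continue  # already processed in an earlier batch
    # ... process the file as in main() above ...
    f.rename(f.with_name("x" + f.name))  # prefix x to mark it as done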

Related

Efficient way to combine multiple csv

I have over 100K CSVs (total file size north of 150 GB) which I need to join. All have standard column names, although the sequence of columns may not match and some CSVs have a few columns missing.
So far I created a dataframe and kept concatenating the dataframe from each CSV in each iteration, to build a single dataframe containing all columns, which I eventually intended to save as CSV.
I tried building a dataframe from 1,000 sample CSVs and noticed that as the dataframe grew, the iteration rate dropped from 10 to 1.5 per second. The trend would probably continue if I went all-in with 100K CSVs, so combining them would take days if not months.
Is there a better way of combining a huge number of CSV files?
Here is my code
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
    df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
    if thisCSV.endswith('type1.csv'):
        df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Possible improvement
Method 1: Using Pandas
#df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
        #df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])  # no need to loop, use the next line instead
        df["File Name"] = thisCSV  # places thisCSV in every row
        #df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)  # concat in a loop is slow, append to a list instead
        df_t1_lst.append(df)
df_t1 = pd.concat(df_t1_lst, ignore_index=True)  # form dataframe from list (faster than pd.concat in a loop)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a: Using Pandas to continuously append to the CSV output file
import os
import pandas as pd
from tqdm import tqdm

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

write_header = True
pathxl = r"C:\Users\darryl\OneDrive\Python"
outpath = pathxl + "\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]
output_file = outpath + "df_t1.csv"

with open(output_file, "w") as ofile:
    pass  # create empty output file

for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:
            df['File Name'] = thisCSV  # add filename column
            df = df.sort_index(axis=1)  # sort columns by name so all files line up
            # append to output file
            df.to_csv(output_file, mode='a', index=False, header=write_header)
            write_header = False  # only write header once
        del df
Method 2: Binary Files
Reading/Writing binary and using memory-map should be faster.
from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','  # separator used by CSV file
header = None
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()
        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # memory-map the file, size 0 means whole file
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    text_iter = iter(mmap_obj.read().split(b'\n'))
                    if header is None:
                        header = next(text_iter)
                        header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                        ofile.write(header)  # write header once
                    else:
                        next(text_iter)  # ignore header row
                    # append the file name to each row and write to the output file
                    file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                    for line in text_iter:
                        if line.strip():  # skip blank lines
                            ofile.write(line.rstrip() + file_value)
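One caveat not addressed above: the question says the column sequence may not match across files, and Method 2 concatenates raw lines, so a file with reordered columns would be silently misaligned. A minimal hedged pre-check (first_raw_header is a hypothetical variable holding the first file's unmodified header bytes) could skip such files:

def header_matches(input_file, reference_header):
    ' True if the file\'s first line equals the reference header bytes '
    with open(input_file, "rb") as ifile:
        return ifile.readline().rstrip() == reference_header.rstrip()

# usage sketch, inside the loop before appending:
# if not header_matches(input_file, first_raw_header):
#     print(f"skipping {thisCSV}: column order differs")
#     continue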

Unable to convert pandas dataframe to csv file

I want to create a dataset.csv file from the various raw files within my input_path. However, my code didn't seem to generate the csv file.
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
import cv2
import re
import csv
from sklearn.dummy import DummyClassifier

input_path = "../input_data/"

class RawToCSV:
    def __init__(self, path_, df):
        self.df = df
        self.measurement_df = None
        self.cls = None
        self.path_ = path_

    def raw_file_processing(self, path_):
        # Open all the subfolders within path
        for root, dirs, files in os.walk(path_):
            for file in files:
                with open(os.path.join(root, file), "r") as data:
                    df = pd.read_csv(data)
                    # Create the ID series by concatenating columns 1-3
                    df = df.assign(ID=df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                        lambda row: '_'.join([str(each) for each in row]), axis=1))
                    df = df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
                    # The class info is the tile_num (i.e., column 3) - TODO: check if class info is tile_num
                    cls_col = df.iloc[2]
                    # Dummy-code the classes
                    cls = pd.get_dummies(cls_col)
                    # Obtain measurement info,
                    # normalize data against blank/empty columns,
                    # log-transform the data
                    for col in df[9:]:
                        if re.search(r"(Blank|Empty)$", col):
                            background = col
                        else:
                            line = col.readline()
                            for data in line:
                                norm_data = data / background
                                self.measurement_df = np.log2(norm_data)
        return self.df["ID"], cls, self.measurement_df

    def dataset_csv(self):
        """Col 1: ID
        Col 2: class
        Col 3-n: measurements"""
        ids = self.df["ID"]
        id_col = ids.to_frame()
        cls_col = self.cls.to_frame()
        frames = [id_col, cls_col, self.measurement_df]
        dataset_df = pd.concat(frames)
        data_csv = dataset_df.to_csv(r"../input_data/dataset.csv")
        return data_csv

dataset = RawToCSV.dataset_csv(input_path)
Desired output dataset.csv:

ID        Class    measurement_A    measurement_B
Sample1   value    value            value
Sample2   value    value            value
Not sure if you need to treat your file path as a raw string. You should simply try:
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
PS: as a test, start by saving your csv in the same folder where your script is:
data_csv = dataset_df.to_csv("dataset.csv")

Check if the csv file exists and do the condition?

Hi, I am working with a csv file and I have data I want to append to it. But first I want to check whether the csv file exists: if TRUE, just open the csv file, append the data to it and save it; if NOT, create a DataFrame with these data and save it.
Note: I already have a csv file, and I want to append the sample of data below to it.
Thanks in advance.
Here is my attempt:
# sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556

# create a function
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    # check if the file exists, if True
    if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
        # read the csv file
        df = pd.read_csv('compare_coordinates.csv')
        # make pd.Series
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        # append the data to df
        df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)
The line df.append(data, ignore_index=True) needs to be:
df = df.append(data, ignore_index=True)
This is because DataFrame.append returns a new DataFrame with the appended lines; it does not append in-place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
To keep the appended values, the result must be saved in a variable, so the line df.append(data, ignore_index=True) has to be edited to df = df.append(data, ignore_index=True). With the file-existence check included, the code looks as follows:
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    if os.path.isfile('My/path/compare_coordinates1.csv'):
        df = pd.read_csv('compare_coordinates1.csv')
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        df = df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('My/path/compare_coordinates1.csv', index=False)
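Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the append branch can be rewritten with pd.concat. A minimal sketch using the same columns as above:

# build a one-row DataFrame and concatenate it instead of df.append(...)
new_row = pd.DataFrame([[ID, img_Latitude, img_Longitude, meta_lat, meta_long]],
                       columns=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
df = pd.concat([df, new_row], ignore_index=True)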

Group values and remove duplicates of groups based on a column in Pandas

I have a data file which is the result of combining several sources that contain name information. Each name has a unique ID (column ID).
Sorting by the ID column, I would like to remove the second/third source findings in the Source column.
My output today (screenshot omitted): all the red rows are "duplicates", since we already got those IDs from the first source (blue rows).
What I would like to achieve (screenshot omitted): for each ID, only the rows from the first source are kept.
How can I achieve this result?
Is there a way to iterate row by row, so that I remove duplicates of an ID already while iterating in the "for file in files:" part of the code?
Or is it easier to do it on df_merged before I output the dataframe to an Excel file?
Code:
import pandas as pd
import os
from datetime import datetime
from shutil import copyfile
from functools import reduce
import numpy as np

# Path
base_path = "G:/Till/"

# Def
def get_files(folder, filetype):
    list_files = []
    directory = os.fsencode(folder)
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith("." + filetype.strip().lower()):
            list_files.append(filename)
    return list_files

# export files
df_result_e = pd.DataFrame()
files = get_files(base_path + "datasource/" + "export", "xlsx")
df_append_e = pd.DataFrame()
for file in files:
    df_temp = pd.read_excel(base_path + "datasource/" + "export/" + file, "Results", dtype=str, index=False)
    df_temp["Source"] = file
    df_append_e = pd.concat([df_append_e, df_temp])
df_result_e = pd.concat([df_result_e, df_append_e])
print(df_result_e)

# match files
df_result_m = pd.DataFrame()
files = get_files(base_path + "datasource/" + "match", "xlsx")
df_append_m = pd.DataFrame()
for file in files:
    df_temp = pd.read_excel(base_path + "datasource/" + "match/" + file, "Page 1", dtype=str, index=False)
    df_append_m = pd.concat([df_append_m, df_temp])
df_result_m = pd.concat([df_result_m, df_append_m])
df_result_m = df_result_m[['ID_Our', 'Name_Our', 'Ext ID']]
df_result_m.rename(columns={'ID_Our': 'ID', 'Name_Our': 'Name', 'Ext ID': 'Match ID'}, inplace=True)
df_result_m.dropna(subset=["Match ID"], inplace=True)  # drop all NA

data_frames = [df_result_e, df_result_m]

# Join files
df_merged = reduce(lambda left, right: pd.merge(left, right, on=["Match ID"], how='outer'), data_frames)

# Output of files
df_merged.to_excel(base_path + "Total datasource Export/" + datetime.now().strftime("%Y-%m-%d_%H%M") + ".xlsx", index=False)
To remove them, you can try transform with factorize:
newdf = df[df.groupby('ID')['Source'].transform(lambda x: x.factorize()[0]) == 0]
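A small example with made-up data showing what the factorize trick keeps: within each ID, factorize numbers the sources in order of appearance (first source -> 0), so only rows from the first source survive, including multiple rows from that source:

import pandas as pd

df = pd.DataFrame({
    "ID": [1, 1, 1, 2, 2],
    "Source": ["a.xlsx", "a.xlsx", "b.xlsx", "c.xlsx", "d.xlsx"],
})
newdf = df[df.groupby('ID')['Source'].transform(lambda x: x.factorize()[0]) == 0]
print(newdf)  # keeps both a.xlsx rows for ID 1 and the c.xlsx row for ID 2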

Merging Multiple xlsx Files into one sheet

Currently I am trying to merge multiple Excel files into one using Python. What I have so far is as follows:
import glob
import pandas as pd

sharedDocs = "C:\\SPSharedDocuments\\*.xlsx"
invoices = "C:\\SPInvoices\\*.xlsx"
formsCerts = "C:\\SPForms&Certificates\\*.xlsx"
mgmt = "C:\\SPManagement\\*.xlsx"
files = [sharedDocs, invoices, formsCerts, mgmt]

for docs in files:
    excel = []
    for file in glob.glob(docs):
        excel.append(file)
    excels = [pd.ExcelFile(name) for name in excel]
    frames = [x.parse(x.sheet_names[0], header=None, index_col=None) for x in excels]
    frames_new = [df[1:] for df in frames[1:]]
    combined = pd.concat(frames_new)
    if sharedDocs == docs:
        combined.to_excel("SharedDocsMerged.xlsx", header=False, index=False)
    elif invoices == docs:
        combined.to_excel("InvoicesMerged.xlsx", header=False, index=False)
    elif formsCerts == docs:
        combined.to_excel("FormsCertsMerged.xlsx", header=False, index=False)
    else:
        combined.to_excel("MGMTMerged.xlsx", header=False, index=False)
This works, but it does not copy the first header row, so I don't know the name of each column. Before, I had the line frames_new = [df[1:] for df in frames[1:]] written as frames[1:] = [df[1:] for df in frames[1:]], but that was causing multiple copies of the same file.
All I need is for it to copy one header so I know the value of each column.
Your help is much appreciated, and thank you in advance.
UPDATE:
I tried using the post suggested below as a similar question and edited my code to look like this:
import glob
import pandas as pd

sharedDocs = "C:\\SPSharedDocuments\\*.xlsx"
invoices = "C:\\SPInvoices\\*.xlsx"
formsCerts = "C:\\SPForms&Certificates\\*.xlsx"
mgmt = "C:\\SPManagement\\*.xlsx"
files = [sharedDocs, invoices, formsCerts, mgmt]

for docs in files:
    excel = []
    for file in glob.glob(docs):
        excel.append(pd.read_excel(file))
    df = pd.concat(excel, ignore_index=True, sort=True)
    if sharedDocs == docs:
        df.to_excel("SharedDocsMerged.xlsx", header=False, index=False)
    elif invoices == docs:
        df.to_excel("InvoicesMerged.xlsx", header=False, index=False)
    elif formsCerts == docs:
        df.to_excel("FormsCertsMerged.xlsx", header=False, index=False)
    else:
        df.to_excel("MGMTMerged.xlsx", header=False, index=False)
The result I get is 2 extra columns on the left, a missing column, and still no header.
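No answer was posted for this one, but from the code above one likely culprit (my reading, not a confirmed fix): header=False in to_excel explicitly suppresses the header row, and sort=True in concat reorders the columns alphabetically when the frames' columns differ. A hedged sketch of the write step:

df = pd.concat(excel, ignore_index=True)  # omit sort=True to preserve column order
# keep the header row; index=False still drops the row labels
df.to_excel("SharedDocsMerged.xlsx", header=True, index=False)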
