Read CSV starting with string from Zipfile - python

I'm trying to loop through a folder that has zip files in it, and only extracting the csv files that start with a certain prefix.
Here is the code:
# Question code, shown as asked (structure restored); it raises the KeyError
# quoted below.
for name in glob.glob(path + '/*.zip'):
    zf = zipfile.ZipFile(name)
    # NOTE: ZipFile.open() looks up an exact member name and does not expand
    # wildcards, so the literal 'Common_MarketResults*.csv' is never found.
    csv_file = pd.read_csv(zf.open('Common_MarketResults*.csv'))
    # NOTE: pd.concat() expects a sequence of DataFrames, not a single one.
    df = pd.concat(csv_file, axis=0).reset_index()
The csv file has some dates after the string I am using, which will be different in every zip file. I am receiving the following error message:
KeyError: "There is no item named 'Common_MarketResults*.csv' in the archive"

Searching for substrings in the filename made this possible.
import glob
import os
import zipfile

import pandas as pd

sub = 'Common_MarketResults'  # required start of the CSV's file name
suf = 'csv'                   # required file extension (without the dot)
data = []                     # one DataFrame per matching CSV member

# os.path.join supplies the separator that `path + '*.zip'` leaves out when
# `path` does not already end with one.
for name in glob.glob(os.path.join(path, '*.zip')):
    # Open each archive once and close it as soon as its members are read.
    with zipfile.ZipFile(name) as zf:
        for member in zf.namelist():
            # Compare against the bare file name so a member stored inside a
            # folder of the archive still matches on its prefix.
            base = os.path.basename(member)
            if not (base.startswith(sub) and base.endswith('.' + suf)):
                continue
            with zf.open(member) as fh:
                csv_file = pd.read_csv(fh)
            # Remember which archive member each row came from.
            csv_file['file_name'] = member
            data.append(csv_file)

Related

Pyinstaller one file works in python shell but fails as an exe

I have a script that takes a few csv files and concat them, it works as intended when running it in the python shell, but it fails when I make an one file exe with pyinstaller.
This is the error I get when I run my script:
The part that seems to fail is this part:
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list = list()
# format columns: read both item columns as strings
dict_conv = {'line_item': lambda x: str(x),
             'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=";", converters=dict_conv, encoding='latin1')  # test latin1
    df_list.append(df)
    # print the location and filename
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])
# combine the collected data frames
# NOTE: pd.concat raises ValueError when df_list is empty, i.e. when the glob
# above found no CSV files.
RLI_combined = pd.concat(df_list, axis=0)
This is my whole script, for context:
# import necessary libraries
import pandas as pd
import os
import glob
from datetime import datetime

# Set filename
file_name = 'daglig_LCR_RLI'

# Folder containing this script.
# NOTE(review): in a PyInstaller one-file build __file__ probably resolves
# inside the temporary extraction directory rather than next to the .exe, so
# the glob below may find no CSV files there - confirm by printing `path`.
path = os.path.dirname(os.path.abspath(__file__))

# Delete the output file of a previous run, if present.
# Presumably this keeps it from being picked up by the "*.csv" glob below.
del_file = path+"\\" + file_name +'.csv'
## If file exists, delete it ##
if os.path.isfile(del_file):
    os.remove(del_file)
    print("File deleted")
else:  ## Show an error ##
    print("File not found: " +del_file)

# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list = list()
# format columns: read both item columns as strings
dict_conv = {'line_item': lambda x: str(x),
             'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=";", converters=dict_conv, encoding='latin1')  # test latin1
    df_list.append(df)
    # print the location and filename
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])

# combine the collected data frames
# NOTE: pd.concat raises ValueError when df_list is empty, i.e. when the glob
# above found no CSV files.
RLI_combined = pd.concat(df_list, axis=0)

# Write date to approval_text
now = datetime.now()
# dd/mm/YY
print_date = now.strftime("%d/%m/%Y")
RLI_combined.loc[:, 'approval_text'] = print_date
# replace value_text with n/a
RLI_combined.loc[:, 'value_text'] = "n/a"

# Sum columns: rows whose column_item is 0030/0050/0080 are kept untouched;
# all other rows are summed per (report_name, line_item, column_item) and
# reduced to one row per group.
m = RLI_combined['column_item'].isin(['0030', '0050', '0080'])
RLI_combined_sum = RLI_combined[~m].copy()
RLI_combined_sum['amount'] = RLI_combined_sum.groupby(['report_name', 'line_item', 'column_item'])['amount'].transform('sum')
RLI_combined_sum = RLI_combined_sum.drop_duplicates(['report_name', 'line_item', 'column_item'])
RLI_combined = pd.concat([RLI_combined_sum, RLI_combined[m]])

# export to csv
RLI_combined.to_csv(path + "//" + file_name + '.csv', index=False, sep=";", encoding='latin1')

# Make log
# Create the directory
directory = "Log"
parent_dir = path
# Path
path_log = os.path.join(parent_dir, directory)
try:
    os.mkdir(path_log)
    print('Log folder dannet')
except OSError as error:
    # os.mkdir failed; the script treats this as "folder already exists".
    print('Log folder eksisterer')

# export a timestamped copy to the log folder
log_name = now.strftime("%d-%m-%Y_%H-%M-%S")
print(log_name)
RLI_combined.to_csv(path + "//" + 'Log' +"//" + file_name+'_' + log_name + '.csv', index=False, sep=";", encoding='latin1')
I hope you can point me in the right direction.
With a PyInstaller one-file executable you will often run into problems like this. When the *.exe starts, it is extracted to a temporary directory, and that directory is, for example, the starting location for relative path definitions.
So even if you get your script running and export your *.csv it will often be somewhere on your HD and not at the place of the *.exe where you perhaps expect it to be.
I think in your case the variable df_list stays empty because there are no files listed in csv_files. This is because in the temp dir (location is written in the top of the output) there are no *.csv files.
Please try printing the contents of csv_files when running the one-file *.exe to check whether this guess is right.
If it is, start by building a one-dir *.exe; if that works, you know the problem lies in your path definitions.

Issues opening and reading txt files in a zip folder in Pandas

My current code is only opening one txt file in a zip folder when there are 4 txt files. I want to read in those txt files to a csv but unsure why it's not reading all of them. I am going to assume it's due to zip_csv_file = zip_csv_files[0] but I am unsure how I can modify my current code to parse all the .txt files in a given zip folder.
Code:
def custom_parse(self, response):
    """Yield one dict per row of a .txt file found in each linked zip archive.

    Every link on the page whose href contains '.zip' is downloaded via
    ``self.download_file``; the first member ending in '.txt' whose name does
    not contain "pre" is read as a tab-separated table of strings, passed
    through ``self.standardized`` and yielded row by row.
    """
    self.logger.info(response.url)
    # XPath attributes are addressed with '@'; '#href' is not valid XPath.
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        # Only the first matching member is read; any further .txt members
        # in the archive are ignored (this is the behaviour asked about).
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            df = pd.read_csv(BytesIO(zip.read()), dtype=str, sep='\t')
        df = self.standardized(df)
        for k, row in df.iterrows():
            yield dict(row)
Edit:
with zip_file.open(zip_csv_file, "r") as zip:
UnboundLocalError: local variable 'zip_csv_file' referenced before assignment
You can try like this if you want data from all the files into a single data frame:
path = "path/to/archive.zip"  # placeholder: set this to your zip file

frames = []
# Open the archive once and close it when all members have been read.
with zipfile.ZipFile(path) as zip_file:
    for member in zip_file.namelist():
        if member.endswith('.txt') and "pre" not in member:
            with zip_file.open(member) as fh:
                # Same parsing options as the question: tab-separated, all
                # columns kept as strings.
                frames.append(pd.read_csv(fh, dtype=str, sep='\t'))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
If you want each file as a different data frame:
path = "path/to/archive.zip"  # placeholder: set this to your zip file

dfs = {}  # member file name -> DataFrame
# Open the archive once and close it when all members have been read.
with zipfile.ZipFile(path) as zip_file:
    for member in zip_file.namelist():
        if member.endswith('.txt') and "pre" not in member:
            with zip_file.open(member) as fh:
                # Same parsing options as the question: tab-separated, all
                # columns kept as strings.
                dfs[member] = pd.read_csv(fh, dtype=str, sep='\t')
This will give you a dict of data frames, with key as the filename

Is there a way to prevent a backslash ('\\') from reduplicating to '\\\\'?

I have a regular structure of archives with the format `pathtofile\folder.zip`, and I need to access a file within each archive with the format `folder\image.png`.
The code I am using to achieve this is:
import zipfile
# NOTE(review): member names inside a zip archive normally use '/' as the
# separator, so 'folder/image.png' is probably the name to request - confirm
# with archive.namelist(). The doubled backslashes in the KeyError text are
# just how Python displays a single backslash inside a quoted string.
imgname = path[:-4].split('\\')[-1] + '\\image.png'
imgfile = archive.extract(imgname, path=dst)
Where path = pathtofile\\folder.zip
However this is throwing the error below:
KeyError: "There is no item named 'folder\\\\image.png' in the archive"
I have also tried removing the \\ from the imgname definition, but this throws the following error:
KeyError: "There is no item named 'folderimage.png' in the archive"
so it seems like the backslashes really are reduplicating themselves somehow.
I'm wondering what could be causing the backslashes to multiply; if anyone has any wise words they would be greatly appreciated!
Larger part of the code:
# Map each zip archive path to the frame indices wanted from it.
archives = dict()
sub_data = data['file_path']  # data is a pandas dataframe containing file names and data about each file
filenames = get_unique(list(sub_data))  # get_unique is equivalent to list(set(sequence))
for filename in filenames:
    zip_path = image_dir+filename+'.zip'
    fil = sub_data == filename
    frames = [f for f in data.loc[fil, 'local_index']]
    archives.update({zip_path: frames})

keys = list(archives.keys())
for i in range(len(filenames)):
    key = keys[i]
    archive = zipfile.ZipFile(key, 'r')
    dst = output_image_dir+'\\'+filenames[i]  # output_image_dir is a filepath not ending in '\\'
    if not os.path.exists(dst):
        os.makedirs(dst)
    for frame in archives[key]:
        # NOTE(review): os.path.join uses '\\' on Windows, while member names
        # inside a zip archive normally use '/'; that mismatch is the likely
        # cause of the KeyError - confirm with archive.namelist().
        imgname = os.path.join(key[:-4].split('\\')[-1], 'frame'+str(frame).zfill(7)+'.png')
        if not os.path.exists(dst+'\\'+imgname):
            imgfile = archive.extract(imgname, path=dst)

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python
Input: Get all files in a given directory of mine (wow.txt, testting.txt,etc.)
Process: I want to run all the files through a def function
Output: I want the output to be all the files names and their respective content below it.For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys
# Define the file path
path="/home/my_files"
file_name="wow.txt"
#Load Data Function
def load_data(path,file_name):
    """
    Input  : path (directory) and file_name (file inside that directory)
    Purpose: loading text file, treating every line as one document
    Output : list of paragraphs/documents and list of titles
             (the first 100 characters of each document are its title)
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "rt", encoding='latin-1') as fin:
        # Iterate the file lazily instead of materialising readlines().
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # One title per document; building it inside the loop also avoids
            # touching `text` when the file is empty.
            titles.append(text[:100])
    print("Total Number of Documents:", len(documents_list))
    return documents_list, titles
#Output
load_data(path,file_name)
Here is my output:
My Problem is that my output only takes one file and shows its content. Obviously, i defined the path and file name in my code to one file but I am confused as to how to write the path in a way to load all the files and output each of its contents separately. Any suggestions?
Using glob:
import glob
files = glob.glob("*.txt")  # get all the .txt files
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code, e.g. show the file name followed by its content
        print(file)
        print(fin.read())
Using os.listdir():
import os
arr = os.listdir()  # entries of the current working directory
files = [x for x in arr if x.endswith('.txt')]
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code, e.g. show the file name followed by its content
        print(file)
        print(fin.read())
Try this:
import glob
# Print the path of every .xyz file directly inside the "test" directory.
for file in glob.glob("test/*.xyz"):
    print(file)
if my directory name was "test" and I had lots of xyz files in them...
You can use glob and pandas
import glob

import pandas as pd

path = r'some_directory'  # use your path
all_files = glob.glob(path + "/*.txt")

# Read every file into its own DataFrame.
# If your files are not comma-separated you might need the 'sep' parameter
# of read_csv as well.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Get it all together; pd.concat raises ValueError on an empty list, so fall
# back to an empty frame when no file matched.
frame = pd.concat(li, axis=0, ignore_index=True) if li else pd.DataFrame()
I will take advantage of the function you have already written, so use the following:
data = []  # one (documents_list, titles) tuple per file
path = "/home/my_files"
dirs = os.listdir(path)
for file in dirs:
    # os.listdir also returns sub-directories, which open() cannot read.
    if os.path.isfile(os.path.join(path, file)):
        data.append(load_data(path, file))
In this case you will have all data in the list data.
Hi you can use a for loop on a listdir:
os.listdir(<path of your directory>)
this gives you the list of files in your directory, but this gives you also the name of folders in that directory
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
    """Return the paths of all files ending in '.txt' below ``dirName``.

    The directory tree is walked recursively with os.walk; each returned path
    is the walked directory joined with the file name. A directory that does
    not exist yields an empty list.
    """
    import os

    fList = list()
    for (dir, _, files) in os.walk(dirName):
        for f in files:
            # A literal suffix test; the former regex '.txt$' had an unescaped
            # dot and therefore also matched names such as 'mytxt'.
            if not f.endswith('.txt'):
                continue
            path = os.path.join(dir, f)
            # Skip entries that cannot be opened, e.g. dangling symlinks.
            if os.path.exists(path):
                fList.append(path)
    return fList
def load_data2(file_list):
    """Load several text files, treating every line as one document.

    Input  : file_list - iterable of file paths, read as latin-1 text
    Output : (documents_list, titles) - the stripped lines of all files in
             order, and for each line its first 100 characters as its title
    """
    documents_list = []
    titles = []
    for file_path in file_list:
        with open(file_path, "rt", encoding='latin-1') as fin:
            # Iterate the file lazily instead of materialising readlines().
            for line in fin:
                text = line.strip()
                documents_list.append(text)
                # One title per document; building it inside the loop also
                # avoids touching `text` when nothing was read.
                titles.append(text[:100])
    print("Total Number of Documents:", len(documents_list))
    return documents_list, titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)

Python: Collect all xlsx. - Open - Add Column - Save in new folder

I am new to Python and am trying to write code that adds a column to multiple .xlsx files and saves these files under their original names in a new folder.
I have started with the code below, but I am missing the part that opens all the files and saves them to my DestPath. I would be pleased if anyone has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
# A lone backslash before the closing quote escapes it and leaves the string
# unterminated, so the backslash itself has to be escaped.
SourcePath = 'C:\\'  # Source Path
DestPath = 'C:\\'  # Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames(path_to_dir, suffix=".xlsx"):
    """Return the names of the entries in ``path_to_dir`` that end with ``suffix``.

    The comparison ignores case, so 'REPORT.XLSX' is found as well, and the
    names are returned sorted so the result does not depend on the order in
    which the operating system lists the directory. Only bare names are
    returned, not full paths.
    """
    wanted = suffix.lower()
    return sorted(
        filename for filename in listdir(path_to_dir)
        if filename.lower().endswith(wanted)
    )
filenames = find_xlsx_filenames(SourcePath)
for filename in filenames:
    fname = path.join(SourcePath, filename)   # file to read
    outname = path.join(DestPath, filename)   # same file name, destination folder
    df = pd.read_excel(fname)  # Read Excel file as a DataFrame
    df['new_col'] = 'Sort Data'  # Add a column 'new_col' filled with 'Sort Data'
    # Write the DataFrame back as an Excel file; to_excel takes the full
    # target path first, and index=False avoids adding an extra index column.
    df.to_excel(outname, index=False)
Thanks in Advance
check if this works
import os
import pandas as pd

path = 'C:/'
# Walk the whole tree below `path` and add an empty 'Sort Data' column to
# every .xlsx file found.
# NOTE: each file is overwritten in place; join a different folder with
# `xlsf` in the to_excel call to save the result elsewhere.
for roots, dirs, files in os.walk(path):
    xlsfile = [name for name in files if name.endswith('.xlsx')]
    for xlsf in xlsfile:
        full_name = os.path.join(roots, xlsf)
        df = pd.read_excel(full_name)
        df['Sort Data'] = ' '
        df.to_excel(full_name, index=False)

Categories

Resources