Issues opening and reading txt files in a zip folder in Pandas - python

My current code is only opening one txt file in a zip folder when there are 4 txt files. I want to read those txt files into a csv but am unsure why it's not reading all of them. I assume it's due to zip_csv_file = zip_csv_files[0], but I am unsure how to modify my current code to parse all the .txt files in a given zip folder.
Code:
def custom_parse(self, response):
    self.logger.info(response.url)
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            df = pd.read_csv(BytesIO(zip.read()), dtype=str, sep='\t')
            df = self.standardized(df)
            for k, row in df.iterrows():
                yield dict(row)
Edit:
with zip_file.open(zip_csv_file, "r") as zip:
UnboundLocalError: local variable 'zip_csv_file' referenced before assignment
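That UnboundLocalError means the assignment zip_csv_file = zip_csv_files[0] never ran for that iteration (for example if it ended up inside a conditional or a try block). Guarding against archives where the filter matches nothing avoids the related failure mode - a minimal sketch of the relevant lines from the loop:
zip_csv_files = [file_name for file_name in zip_file.namelist()
                 if file_name.endswith(".txt") and "pre" not in file_name]
if not zip_csv_files:
    # nothing to parse in this archive; skip it instead of indexing [0]
    continue
zip_csv_file = zip_csv_files[0]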

You can try it like this if you want the data from all the files in a single data frame:
path = <path to the zip file>
df = pd.concat(
    [pd.read_csv(zipfile.ZipFile(path).open(text_file))
     for text_file in zipfile.ZipFile(path).infolist()
     if text_file.filename.endswith('.txt') and "pre" not in text_file.filename],
    ignore_index=True
)
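A small refinement: constructing the ZipFile once and reusing it avoids opening the archive twice per file. A minimal variant of the same idea (ZipFile.open also accepts the ZipInfo objects from infolist() directly):
path = <path to the zip file>
zf = zipfile.ZipFile(path)
df = pd.concat(
    [pd.read_csv(zf.open(info)) for info in zf.infolist()
     if info.filename.endswith('.txt') and "pre" not in info.filename],
    ignore_index=True
)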
If you want each file as a different data frame:
path = <path to the zip file>
zip_file = zipfile.ZipFile(path)
dfs = {text_file.filename: pd.read_csv(zip_file.open(text_file.filename))
       for text_file in zip_file.infolist()
       if text_file.filename.endswith('.txt') and "pre" not in text_file.filename}
This will give you a dict of data frames, keyed by filename.
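To work with the result, you can iterate the dict, e.g. to check what was read (names taken from the snippet above):
for filename, frame in dfs.items():
    print(filename, frame.shape)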

Related

Extract file names from a directory and classify them by extension in Excel | Using Python

I am trying to classify files from a local directory by extension into an Excel sheet.
My input should be:
a directory path
My output should be:
an Excel file with different sheets based on extension.
For example, if the input directory has 5 .sh files, 8 .py files, and so on, a sheet should be created per extension and filled with the file names.
I am able to achieve this, but it is a bit hard-coded.
Any help automating it without the hard-coding would be appreciated.
Below is the code I tried, and it is working fine:
import glob
import pandas as pd

path = r'<path_name>'  # base path
files = glob.glob(path + '/**/*.*', recursive=True)
hql, hive, ksh, sh, csv, txt, sql, py = ([] for i in range(8))
for fpath in files:
    chk_file = fpath.split('\\')
    for file_name in chk_file:
        if '.hql' in file_name:
            print("Hql:", file_name)
            comb = f'{file_name}'
            hql.append(comb)
        if '.hive' in file_name:
            print(file_name)
            comb = f'{file_name}'
            hive.append(comb)
        if '.ksh' in file_name:
            print(file_name)
            comb = f'{file_name}'
            ksh.append(comb)
        if '.sh' in file_name:
            print(file_name)
            comb = f'{file_name}'
            sh.append(comb)
        if '.sql' in file_name:
            print(file_name)
            comb = f'{file_name}'
            sql.append(comb)
        if '.txt' in file_name:
            print(file_name)
            comb = f'{file_name}'
            txt.append(comb)
        if '.csv' in file_name:
            print(file_name)
            comb = f'{file_name}'
            csv.append(comb)
        if '.py' in file_name:
            print(file_name)
            comb = f'{file_name}'
            py.append(comb)

writer = pd.ExcelWriter(r'C:\Users\saurabh.arun.kumar\OneDrive - Accenture\Desktop\outfile2.xlsx',
                        engine='xlsxwriter')
new_hql = pd.DataFrame(hql, columns=['file'])
new_hive = pd.DataFrame(hive, columns=['file'])
new_sql = pd.DataFrame(sql, columns=['file'])
new_ksh = pd.DataFrame(ksh, columns=['file'])
new_txt = pd.DataFrame(txt, columns=['file'])
new_sh = pd.DataFrame(sh, columns=['file'])
new_csv = pd.DataFrame(csv, columns=['file'])
new_py = pd.DataFrame(py, columns=['file'])
new_hql.to_excel(writer, sheet_name='hql', index=False)
new_hive.to_excel(writer, sheet_name='hive', index=False)
new_sql.to_excel(writer, sheet_name='sql', index=False)
new_ksh.to_excel(writer, sheet_name='ksh', index=False)
new_csv.to_excel(writer, sheet_name='csv', index=False)
new_txt.to_excel(writer, sheet_name='txt', index=False)
new_sh.to_excel(writer, sheet_name='sh', index=False)
new_py.to_excel(writer, sheet_name='py', index=False)
writer.save()
writer.close()
print("Executed")
This code only works with the extensions provided in the code. I want it to detect the extensions on its own and create new sheets with the file names.
I hope I was able to explain the scenario.
You can split the extension from a file's path by using
fname, fext = os.path.splitext("/what/ever/kind/of/file/this.is.txt")
Use that to create a dict of "ext" -> "list of files".
Use the dict to create n dataframes. Write them to Excel.
If you only want certain extensions, filter the dict keys to those you want:
import glob
import pandas as pd
from os import path

p = r'/redacted/location'  # fix this to your path
files = glob.glob(p + '/**/*.*', recursive=True)
d = {}
i = 0  # used to redact my file names - you would simply store fn+fex
for f in files:
    fn, fex = path.splitext(f)
    # filter for extensions you want
    if fex in (".txt", ".xlsx", ".docx"):
        # use d.setdefault(fex,[]).append(f) - I use something
        # to blank out my file names here
        # use collections.defaultdict to get a speed kick if needed
        d.setdefault(fex, []).append(f"file...{i}{fex}")
        i += 1

# create single data frames per file extension from dictionary
dfs = []
for key, value in d.items():
    df = pd.DataFrame({key: value})
    dfs.append(df)

# do your excel writing here - use column header for sheet name etc.
for df in dfs:
    print(df)
Output (files/names redacted):
.docx
0 file...0.docx
1 file...2.docx
2 file...3.docx
3 file...4.docx
4 file...5.docx
5 file...6.docx
6 file...7.docx
7 file...12.docx
8 file...13.docx
9 file...14.docx
10 file...15.docx
11 file...16.docx
.xlsx
0 file...1.xlsx
1 file...8.xlsx
2 file...9.xlsx
3 file...10.xlsx
4 file...11.xlsx
5 file...17.xlsx
You can then use the column header of each single DF to write your Excel sheet - something akin to:
with pd.ExcelWriter('C:/temp/outfile2.xlsx') as writer:
    for df in dfs:
        df.to_excel(writer, sheet_name=df.columns[0])
should do it - I can't test that right now.
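For reference, putting the pieces together into the fully automatic version the question asks for - a minimal sketch, untested, with the output filename as an assumption:
import glob
import os
import pandas as pd

p = r'<path_name>'  # base path, as in the question
d = {}
for f in glob.glob(p + '/**/*.*', recursive=True):
    _, fex = os.path.splitext(f)
    # one bucket per extension, discovered automatically
    d.setdefault(fex, []).append(os.path.basename(f))

with pd.ExcelWriter('outfile2.xlsx') as writer:
    for fex, names in d.items():
        # sheet named after the extension, minus the leading dot
        pd.DataFrame(names, columns=['file']).to_excel(
            writer, sheet_name=fex.lstrip('.') or 'noext', index=False)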

How to unzip and parse multiple files in Scrapy

I am trying to parse a list of .txt files within a zip folder, but it's only parsing one file from that list.
Code:
def custom_parse(self, response):
    self.logger.info(response.url)
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            # df = pd.read_csv(BytesIO(zip.read()), dtype=object)
            df = pd.read_csv(zip, dtype=object, header=None, sep='delimiter')
            df = self.standardized(df)
            for k, row in df.iterrows():
                yield dict(row)

def standardized(self, df):
    # df.columns = [col.lower().strip().replace(" ", "_") for col in df.columns]
    df = df.fillna('')
    return df
I assume it's due to zip_csv_file = zip_csv_files[0], but I am unsure how to modify my current code to parse all the .txt files in a given zip folder.
You already pull out all the .txt files with your list comprehension, so just read them in a loop and concatenate them. This is untested, but it should be close.
Replace the appropriate section of your code with this:
UPDATE:
zip_file = zipfile.ZipFile(local_path)
text_files = zip_file.infolist()
df_list = []
for file_name in text_files:
    if file_name.filename.endswith(".txt") and "pre" not in file_name.filename:
        df_list.append(pd.read_csv(zip_file.open(file_name.filename), dtype=object, header=None, sep='delimiter'))
df = pd.concat(df_list)
df = self.standardized(df)

Zip multiple Excel files, then merge their content into one file using Python

I am trying to create 2 functions with Python:
The first function zips multiple Excel files that exist in the given path.
The second function reads the content of the zip file, then merges all the existing files into one Excel file (all files have the same structure).
The problem is that when I run the script, it crashes when it comes to reading the zip file and displays the below error:
AttributeError: 'ZipFile' object has no attribute 'seek'
Code:
import pandas as pd
import numpy as np
import zipfile
import os

def get_all_file_path(directory):
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)
    return file_paths

# Excel file merge function
def excel_file_merge(zip_file_name):
    df = pd.DataFrame()
    archive = zipfile.ZipFile(zip_file_name, 'r')
    with zipfile.ZipFile(zip_file_name, "r") as f:
        for file in f.namelist():
            xlfile = archive.open(file)
            if file.endswith('.xlsx'):
                # Add a note indicating the file name that this dataframe originates from
                df_xl = pd.read_excel(xlfile, engine='openpyxl')
                df_xl['Note'] = file
                # Appends content of each Excel file iteratively
                df = df.append(df_xl, ignore_index=True)
    return df

uploaded_file = 'F:/AIenv/test_zip'
file_paths = get_all_file_path(uploaded_file)
print("following files will be zipped: ")
for file_name in file_paths:
    print(file_name)
with zipfile.ZipFile("my _python_files.zip", "w") as f:
    for file in file_paths:
        f.write(file)
f.close()
print("All Files Zipped successfully")
df = excel_file_merge(f)
print(df)
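The crash comes from the last lines: excel_file_merge(f) receives the ZipFile object f (already closed by the with block) instead of a path, so zipfile.ZipFile() treats it as a file-like object and calls seek() on it, which ZipFile does not have. Passing the archive's path instead avoids the error - a minimal sketch of the final lines, keeping the rest of the code as is:
zip_name = "my _python_files.zip"
with zipfile.ZipFile(zip_name, "w") as f:
    for file in file_paths:
        f.write(file)
print("All Files Zipped successfully")
# pass the path string, not the ZipFile object
df = excel_file_merge(zip_name)
print(df)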

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python?
Input: Get all files in a given directory of mine (wow.txt, testing.txt, etc.)
Process: I want to run all the files through a def function
Output: I want the output to be all the file names with their respective content below each one. For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys
# Define the file path
path="/home/my_files"
file_name="wow.txt"
#Load Data Function
def load_data(path,file_name):
"""
Input : path and file_name
Purpose: loading text file
Output : list of paragraphs/documents and
title(initial 100 words considered as title of document)
"""
documents_list = []
titles=[]
with open( os.path.join(path, file_name) ,"rt", encoding='latin-1') as fin:
for line in fin.readlines():
text = line.strip()
documents_list.append(text)
print("Total Number of Documents:",len(documents_list))
titles.append( text[0:min(len(text),100)] )
return documents_list,titles
#Output
load_data(path,file_name)
My problem is that my output only takes one file and shows its content. Obviously, I defined the path and file name in my code to point to a single file, but I am confused as to how to write the path so that it loads all the files and outputs each of their contents separately. Any suggestions?
Using glob:
import glob

files = glob.glob("*.txt")  # get all the .txt files
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code
Using os.listdir():
import os

arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code
Try this:
import glob

for file in glob.glob("test/*.xyz"):
    print(file)
This assumes my directory name was "test" and I had lots of .xyz files in it...
You can use glob and pandas:
import pandas as pd
import glob

path = r'some_directory'  # use your path
all_files = glob.glob(path + "/*.txt")
li = []
for filename in all_files:
    # read file here
    # if you decide to use pandas you might need to use the 'sep' parameter as well
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
I will take advantage of the function you have already written, so use the following:
data = []
path = "/home/my_files"
dirs = os.listdir(path)
for file in dirs:
    data.append(load_data(path, file))
In this case you will have all the data in the list data.
Hi, you can use a for loop on os.listdir:
os.listdir(<path of your directory>)
This gives you the list of files in your directory, but it also gives you the names of any folders in that directory.
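If you only want the files, you can filter with os.path.isfile - a small sketch, reusing the path from the question:
import os

p = "/home/my_files"
only_files = [f for f in os.listdir(p)
              if os.path.isfile(os.path.join(p, f))]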
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
    import os
    import re
    fileList = list()
    for (dir, _, files) in os.walk(dirName):
        for f in files:
            path = os.path.join(dir, f)
            if os.path.exists(path):
                fileList.append(path)
    fList = list()
    prog = re.compile(r'\.txt$')
    for k in range(len(fileList)):
        binMatch = prog.search(fileList[k])
        if binMatch:
            fList.append(binMatch.string)
    return fList
def load_data2(file_list):
    documents_list = []
    titles = []
    for file_path in file_list:
        with open(file_path, "rt", encoding='latin-1') as fin:
            for line in fin.readlines():
                text = line.strip()
                documents_list.append(text)
    print("Total Number of Documents:", len(documents_list))
    titles.append(text[0:min(len(text), 100)])
    return documents_list, titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)

Read CSV starting with string from Zipfile

I'm trying to loop through a folder that has zip files in it, extracting only the csv files that start with a certain prefix.
Here is the code:
for name in glob.glob(path + '/*.zip'):
    zf = zipfile.ZipFile(name)
    csv_file = pd.read_csv(zf.open('Common_MarketResults*.csv'))
    df = pd.concat(csv_file, axis=0).reset_index()
The csv file names have some dates after the string I am using, which will be different in every zip file. I am receiving the following error message:
KeyError: "There is no item named 'Common_MarketResults*.csv' in the archive"
Searching for substrings in the filename made this possible:
sub = 'Common_MarketResults'
suf = 'csv'
data = []
for name in glob.glob(path + '*.zip'):
    zf = zipfile.ZipFile(name)
    zf_nfo = zf.namelist()
    for s in zf_nfo:
        if sub in s and suf in s:
            csv_file_str = s
            csv_file = pd.read_csv(zf.open(csv_file_str))
            csv_file['file_name'] = csv_file_str
            data.append(csv_file)
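Since the goal in the question was a single frame, the collected list can then be combined the same way as in the original snippet:
df = pd.concat(data, axis=0, ignore_index=True)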
