I am trying to parse a list of .txt files within a zip archive, but my code only parses one file from that list.
Code:
def custom_parse(self, response):
    self.logger.info(response.url)
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            # df = pd.read_csv(BytesIO(zip.read()), dtype=object)
            df = pd.read_csv(zip, dtype=object, header=None, sep='delimiter')
            df = self.standardized(df)
            for k, row in df.iterrows():
                yield dict(row)
def standardized(self, df):
    # df.columns = [col.lower().strip().replace(" ", "_") for col in df.columns]
    df = df.fillna('')
    return df
I am going to assume it's due to zip_csv_file = zip_csv_files[0] but I am unsure how I can modify my current code to parse all the .txt files in a given zip folder.
You already pull out all the .txt files with your list comprehension, so just read those in a loop and concatenate them. This is untested, but should be close.
Replace the appropriate section of your code with this:
UPDATE:
zip_file = zipfile.ZipFile(local_path)
text_files = zip_file.infolist()
df_list = []
for file_name in text_files:
    if file_name.filename.endswith(".txt") and "pre" not in file_name.filename:
        # open each matching member and read it into its own frame
        df_list.append(pd.read_csv(zip_file.open(file_name.filename), dtype=object, header=None, sep='delimiter'))
df = pd.concat(df_list)
df = self.standardized(df)
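For reference, a self-contained sketch of the same loop-and-concat pattern outside the spider (untested; "filings.zip" is a hypothetical local path standing in for the download_file result):

import zipfile
import pandas as pd

local_path = "filings.zip"  # hypothetical path to an already-downloaded archive

with zipfile.ZipFile(local_path) as zip_file:
    df_list = []
    for info in zip_file.infolist():
        if info.filename.endswith(".txt") and "pre" not in info.filename:
            with zip_file.open(info.filename) as fh:
                # sep='delimiter' is the same one-column trick as in the question
                df_list.append(pd.read_csv(fh, dtype=object, header=None, sep='delimiter'))
df = pd.concat(df_list, ignore_index=True)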
Related
How can I select the last rows of text files with a for loop?
This is my first idea:
import glob
import pandas as pd

path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
txt_list = []
for file in file_list:
    txt_list.append(pd.read_csv(file))
for file in file_list:
    txt_list[-7::3]
excl_merged = pd.concat(txt_list, ignore_index=True)
excl_merged.to_excel('Total.xlsx', index=False)
Your code is incorrect. Here is a version that should work:
import glob
import pandas as pd

path = input("Insert location:")
file_list = glob.glob(path + "/*.txt")
df_list = []
for file in file_list:
    df = pd.read_csv(file)
    df_list.append(df.tail(3))  # last 3 rows of each file's dataframe
excl_merged = pd.concat(df_list, ignore_index=True)
excl_merged.to_excel('Total.xlsx', index=False)
Explanation: the tail() method returns the last n rows of a dataframe, where n is provided as an argument.
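A quick illustration of tail() on a toy dataframe (the data here is made up):

import pandas as pd

df = pd.DataFrame({"a": range(10)})
print(df.tail(3))  # prints the rows with index 7, 8 and 9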
I am trying to classify the files in a local directory by extension and write them to an Excel workbook.
My input should be:
a directory path
My output should be:
an Excel workbook with a separate sheet per extension.
For example, if the input directory has 5 .sh files, 8 .py files and so on, a sheet should be created for each extension, listing the file names.
I am able to achieve this, but my solution is hard-coded. Any help automating it, without hard-coding the extensions, would be appreciated.
Below is the code I tried, and it works fine:
import glob
import pandas as pd

path = r'<path_name>'  # base path
files = glob.glob(path + '/**/*.*', recursive=True)
hql, hive, ksh, sh, csv, txt, sql, py = ([] for i in range(8))
for fpath in files:
    chk_file = fpath.split('\\')
    for file_name in chk_file:
        if '.hql' in file_name:
            print("Hql:", file_name)
            comb = f'{file_name}'
            hql.append(comb)
        if '.hive' in file_name:
            print(file_name)
            comb = f'{file_name}'
            hive.append(comb)
        if '.ksh' in file_name:
            print(file_name)
            comb = f'{file_name}'
            ksh.append(comb)
        if '.sh' in file_name:
            print(file_name)
            comb = f'{file_name}'
            sh.append(comb)
        if '.sql' in file_name:
            print(file_name)
            comb = f'{file_name}'
            sql.append(comb)
        if '.txt' in file_name:
            print(file_name)
            comb = f'{file_name}'
            txt.append(comb)
        if '.csv' in file_name:
            print(file_name)
            comb = f'{file_name}'
            csv.append(comb)
        if '.py' in file_name:
            print(file_name)
            comb = f'{file_name}'
            py.append(comb)

writer = pd.ExcelWriter(r'C:\Users\saurabh.arun.kumar\OneDrive - Accenture\Desktop\outfile2.xlsx',
                        engine='xlsxwriter')
new_hql = pd.DataFrame(hql, columns=['file'])
new_hive = pd.DataFrame(hive, columns=['file'])
new_sql = pd.DataFrame(sql, columns=['file'])
new_ksh = pd.DataFrame(ksh, columns=['file'])
new_txt = pd.DataFrame(txt, columns=['file'])
new_sh = pd.DataFrame(sh, columns=['file'])
new_csv = pd.DataFrame(csv, columns=['file'])
new_py = pd.DataFrame(py, columns=['file'])
new_hql.to_excel(writer, sheet_name='hql', index=False)
new_hive.to_excel(writer, sheet_name='hive', index=False)
new_sql.to_excel(writer, sheet_name='sql', index=False)
new_ksh.to_excel(writer, sheet_name='ksh', index=False)
new_csv.to_excel(writer, sheet_name='csv', index=False)
new_txt.to_excel(writer, sheet_name='txt', index=False)
new_sh.to_excel(writer, sheet_name='sh', index=False)
new_py.to_excel(writer, sheet_name='py', index=False)
writer.save()
writer.close()
print("Executed")
This code only works with the extensions hard-coded into it. I want it to detect the extensions on its own and create a new sheet of file names for each one. I hope I have explained the scenario.
You can split the extension from a file's path by using
fname, fext = os.path.splitext("/what/ever/kind/of/file/this.is.txt")
Use that to create a dict of "ext" -> "list of files".
Use the dict to create n dataframes. Write them to excel.
If you only want certain extensions, filter the dict-keys to those you want:
import glob
import pandas as pd
from os import path

p = r'/redacted/location'  # fix this to your path
files = glob.glob(p + '/**/*.*', recursive=True)

d = {}
i = 0  # used to redact my file names - you would simply store fn+fex
for f in files:
    fn, fex = path.splitext(f)
    # filter for extensions you want
    if fex in (".txt", ".xlsx", ".docx"):
        # use d.setdefault(fex,[]).append(f) - I use something
        # to blank out my file names here
        # use collections.defaultdict to get a speed kick if needed
        d.setdefault(fex, []).append(f"file...{i}{fex}")
        i += 1

# create single data frames per file extension from dictionary
dfs = []
for key, value in d.items():
    df = pd.DataFrame({key: value})
    dfs.append(df)

# do your excel writing here - use column header for sheet name etc.
for df in dfs:
    print(df)
Output (files/names redacted):
.docx
0 file...0.docx
1 file...2.docx
2 file...3.docx
3 file...4.docx
4 file...5.docx
5 file...6.docx
6 file...7.docx
7 file...12.docx
8 file...13.docx
9 file...14.docx
10 file...15.docx
11 file...16.docx
.xlsx
0 file...1.xlsx
1 file...8.xlsx
2 file...9.xlsx
3 file...10.xlsx
4 file...11.xlsx
5 file...17.xlsx
You can then use the column header of each single DF to write your excel sheet - something akin to:
with pd.ExcelWriter('C:/temp/outfile2.xlsx') as writer:
    for df in dfs:
        df.to_excel(writer, sheet_name=df.columns[0])
should do it - can't test that right now.
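Putting it all together, a rough end-to-end sketch (equally untested; the output file name is an assumption, and it presumes every matched file has a normal extension):

import glob
import os
import pandas as pd

p = r'/redacted/location'  # your base path
d = {}
for f in glob.glob(p + '/**/*.*', recursive=True):
    fn, fex = os.path.splitext(f)
    d.setdefault(fex, []).append(os.path.basename(f))  # group file names by extension

with pd.ExcelWriter('outfile2.xlsx') as writer:  # hypothetical output name
    for fex, names in d.items():
        # strip the leading dot - it is not needed in a sheet name
        pd.DataFrame({fex: names}).to_excel(writer, sheet_name=fex.lstrip('.'), index=False)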
I want to read out data from different files in one folder. The files are named "1.csv", "2.csv", "3.csv" ... "96.csv". But instead of reading them in numerical order, it reads "1.csv", "10.csv", "11.csv" ... "2.csv", "21.csv".
Does anyone know how to fix this problem?
Thanks!
def csv_readout_folder(path):
    os.chdir(path)
    files = glob.glob(path + '/' + '*.csv')
    all_data = pd.DataFrame()
    for f in files:
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data
In your code, for f in files: reads the files in the order they appear in the list, which here is lexicographic ("10.csv" sorts before "2.csv"). You can try sort functions, but it may be easier to make a new list like this:
file_lst = []
for k in range(1, 97):
    file_lst.append(f'{str(k)}.csv')
s1 = pd.Series(file_lst)

def csv_readout_folder(path):
    os.chdir(path)
    files = glob.glob(path + '/' + '*.csv')
    all_data = pd.DataFrame()
    for f in list(s1[s1.isin(file_lst)]):
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data
You can do something like
files = [f'{path}/{i}.csv' for i in range(1, 22)]
instead of
files = glob.glob(path + '/' + '*.csv')
UPD:
def csv_readout_folder(path):
    os.chdir(path)
    n_files = len([el for el in os.scandir(path) if el.is_file()])
    files = [f'{path}/{i}.csv' for i in range(1, n_files + 1)]
    all_data = pd.DataFrame()
    for f in files:
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data
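Alternatively, you could keep the glob and sort the result numerically on each file's stem; a minimal sketch (assuming every .csv name in the folder is a plain number):

import glob
import os

path = '.'  # hypothetical folder holding 1.csv ... 96.csv
files = glob.glob(path + '/*.csv')
# '10.csv' sorts before '2.csv' as a string, so sort on the integer stem instead
files.sort(key=lambda f: int(os.path.splitext(os.path.basename(f))[0]))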
My current code only opens one txt file in a zip folder when there are 4 txt files. I want to read those txt files into a csv, but I am unsure why it's not reading all of them. I am going to assume it's due to zip_csv_file = zip_csv_files[0], but I am unsure how I can modify my current code to parse all the .txt files in a given zip folder.
Code:
def custom_parse(self, response):
    self.logger.info(response.url)
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            df = pd.read_csv(BytesIO(zip.read()), dtype=str, sep='\t')
            df = self.standardized(df)
            for k, row in df.iterrows():
                yield dict(row)
Edit:
with zip_file.open(zip_csv_file, "r") as zip:
UnboundLocalError: local variable 'zip_csv_file' referenced before assignment
You can try it like this if you want the data from all the files in a single data frame:
path = <path to the zip file>
df = pd.concat(
    [pd.read_csv(zipfile.ZipFile(path).open(text_file))
     for text_file in zipfile.ZipFile(path).infolist()
     if text_file.filename.endswith('.txt') and "pre" not in text_file.filename],
    ignore_index=True
)
If you want each file as a different data frame:
path = <path to the zip file>
zip_file = zipfile.ZipFile(path)
dfs = {text_file.filename: pd.read_csv(zip_file.open(text_file.filename))
       for text_file in zip_file.infolist()
       if text_file.filename.endswith('.txt') and "pre" not in text_file.filename}
This will give you a dict of data frames, with the filename as the key.
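For example, to work through that dict afterwards (the shape printout is just illustration):

for filename, df in dfs.items():
    print(filename, df.shape)  # one dataframe per .txt file in the archive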
I am reading in multiple files and adding them to a list:
import pandas as pd
import glob
import ntpath

path = r'C:\Folder1\Folder2\Folder3\Folder3'
all_files = glob.glob(path + "/*.dat")  # .dat files only
mylist = []
for filename in all_files:
    name = ntpath.basename(filename)  # for renaming the DF
    name = name.replace('.dat', '')   # remove extension
    try:
        name = pd.read_csv(filename, sep='\t', engine='python')
        mylist.append(name)
    except:
        print(f'File not read: {filename}')
Now I want to just display the DFs in this list.
This is what I've tried:
for thing in mylist:
    print(thing.name)

AttributeError: 'DataFrame' object has no attribute 'name'
And
for item in mylist:
    print(item)
But that just prints the whole DF content.
name = pd.read_csv(filename, sep='\t', engine='python')
mylist.append(name)
Here, name is a dataframe, not the name of your dataframe.
To add a name to your dataframe, use
df = pd.read_csv(filename, sep='\t', engine='python')
df_name="Sample name"
mylist.append({'data':df, 'name':df_name})
>>> print(thing['name'])
Sample name
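With that structure in place, displaying just the names becomes a simple loop (a sketch using the keys introduced above):

for item in mylist:
    print(item['name'])  # the label stored next to each dataframe
    # item['data'] is the dataframe itself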
You can use a dictionary for that.
Writing to dict:
import pandas as pd
import glob
import ntpath

path = r'C:\Folder1\Folder2\Folder3\Folder3'
all_files = glob.glob(path + "/*.dat")  # .dat files only
mydict = {}
for filename in all_files:
    name = ntpath.basename(filename)  # for renaming the DF
    name = name.replace('.dat', '')   # remove extension
    try:
        mydict[name] = pd.read_csv(filename, sep='\t', engine='python')
    except:
        print(f'File not read: {filename}')
To read a df (say filename1) again:
df = mydict['filename1']
or to iterate over all df's in mydict:
for df in mydict.values():
    # use df...
or:
for key in mydict:
    print(key)
    df = mydict[key]
    # use df...