csv_readout_folder reads files in the wrong order - Python

I want to read data from different files in one folder. The files are named "1.csv", "2.csv", "3.csv", ..., "96.csv". But instead of reading them in numeric order, it reads "1.csv", "10.csv", "11.csv", ..., "2.csv", "20.csv", "21.csv", and so on.
Does anyone know how to fix this problem?
Thanks!
def csv_readout_folder(path):
    os.chdir(path)
    files = glob.glob(path + '/' + '*.csv')
    all_data = pd.DataFrame()
    for f in files:
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data

In your code, for f in files: reads the files in the order they appear in the list. You can try sort functions, but it may be easier to make a new list like this:
file_lst = []
for k in range(1, 97):
    file_lst.append(f'{k}.csv')
s1 = pd.Series(file_lst)

def csv_readout_folder(path):
    os.chdir(path)
    files = glob.glob(path + '/' + '*.csv')
    all_data = pd.DataFrame()
    for f in list(s1[s1.isin(file_lst)]):
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data
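If you would rather sort the glob results directly, here is a minimal, untested sketch that sorts on the integer value of each file's stem instead of the string:
import glob
import os

files = glob.glob(path + '/' + '*.csv')  # 'path' is the folder argument from the question
# compare the bare filename as an int, so "2.csv" sorts before "10.csv"
files.sort(key=lambda f: int(os.path.splitext(os.path.basename(f))[0]))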

You can do something like
files = [f'{path}/{i}.csv' for i in range(1, 97)]
instead of
files = glob.glob(path +'/'+'*.csv')
UPD:
def csv_readout_folder(path):
    os.chdir(path)
    n_files = len([el for el in os.scandir(path) if el.is_file()])
    files = [f'{path}/{i}.csv' for i in range(1, n_files + 1)]
    all_data = pd.DataFrame()
    for f in files:
        data = csv_readout(path, f)
        all_data = pd.concat([all_data, data])
    return all_data
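Note that this counts every file in the folder, so it assumes the directory contains only the numbered .csv files.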

Related

How to unzip and parse multiple files in Scrapy

I am trying to parse a list of .txt files within a zip folder but it's only parsing one file from that list
Code:
def custom_parse(self, response):
    self.logger.info(response.url)
    links = response.xpath("//a[contains(@href, '.zip')]/@href").getall()
    for link in list(set(links)):
        print(link)
        local_path = self.download_file("https://www.sec.gov" + link)
        zip_file = zipfile.ZipFile(local_path)
        zip_csv_files = [file_name for file_name in zip_file.namelist() if file_name.endswith(".txt") and "pre" not in file_name]
        zip_csv_file = zip_csv_files[0]
        with zip_file.open(zip_csv_file, "r") as zip:
            # df = pd.read_csv(BytesIO(zip.read()), dtype=object)
            df = pd.read_csv(zip, dtype=object, header=None, sep='delimiter')
            df = self.standardized(df)
            for k, row in df.iterrows():
                yield dict(row)

def standardized(self, df):
    # df.columns = [col.lower().strip().replace(" ", "_") for col in df.columns]
    df = df.fillna('')
    return df
I am going to assume it's due to zip_csv_file = zip_csv_files[0] but I am unsure how I can modify my current code to parse all the .txt files in a given zip folder.
You already pull out all the .txt files with your list comprehension, so just read those in a loop and concatenate them. This is untested, but should be close.
Replace the appropriate section of your code with this:
UPDATE:
zip_file = zipfile.ZipFile(local_path)
text_files = zip_file.infolist()
df_list = []
for file_name in text_files:
    if file_name.filename.endswith(".txt") and "pre" not in file_name.filename:
        df_list.append(pd.read_csv(zip_file.open(file_name.filename), dtype=object, header=None, sep='delimiter'))
df = pd.concat(df_list)
df = self.standardized(df)
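Equivalently, you could keep your original namelist() comprehension and just loop over all of its matches instead of taking element [0]; an untested sketch:
zip_csv_files = [name for name in zip_file.namelist() if name.endswith(".txt") and "pre" not in name]
df_list = [pd.read_csv(zip_file.open(name), dtype=object, header=None, sep='delimiter') for name in zip_csv_files]
df = self.standardized(pd.concat(df_list))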

Loop over common files in multiple folders

How could I find files with the same filename in multiple folders, and then perform the same operations?
def findCommonDeep(path1, path2):
    return set.intersection(*(
        set(os.path.relpath(os.path.join(root, file), path)
            for root, _, files in os.walk(path)
            for file in files)
        for path in (path1, path2)))
lista = []
for x in [2010, 2017, 2020]:
    if x > 2015:
        filepath = rf'My Documents\Analysis\{x}\LateAnalysis\*.csv'
    else:
        filepath = rf'My Documents\Analysis\{x}\Early_Analysis\*.csv'
    fname = os.path.basename(filepath, filepath)
    findCommonDeep(fname)
    for file in glob.glob(filename_common):
        df = pd.read_csv(file)
        df = df.set_index('date')
        lista.append(df)
As you've tagged pandas, let's use pandas and pathlib to return a dictionary of files with similar names:
from pathlib import Path
import pandas as pd

def return_similar_files(start_dir: str) -> dict:
    all_files = Path(start_dir).rglob('*.csv')
    df = pd.DataFrame({'files': all_files})
    df['name'] = df['files'].apply(lambda x: x.name)  # pathlib Path attribute
    return df.groupby('name')['files'].agg(list).to_dict()
This will return a dictionary of files, like so:
{'file_1.csv' : [list_of_paths]}
You can then operate on them as you need.
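For example, a rough sketch of how you might consume that dictionary, reusing the per-file steps from the question (the start directory here is a placeholder):
similar = return_similar_files('My Documents/Analysis')  # placeholder start directory
for name, paths in similar.items():
    # stack all same-named files into one frame, then process as in the question
    df = pd.concat(pd.read_csv(p) for p in paths)
    df = df.set_index('date')
    print(name, df.shape)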

Trouble parsing multiple XML files and processing them as a DataFrame in Python

I want to parse multiple XML files into a DataFrame. They share the same XPath structure.
I have used the ElementTree and os Python libraries. The code parses all the files, but it prints out an empty DataFrame. However, with a single file the code works properly.
mypath = r'C:\Users\testFile'
files = [path.join(mypath, f) for f in listdir(mypath) if f.endswith('.xml')]
for file in files:
    xtree = et.parse(file)
    xroot = xtree.getroot()
    df_cols = ['value']
    out_xml = pd.DataFrame(columns=df_cols)
    for node in xroot.findall(r'./Group[1]/Details/Section[3]/Subreport/Group/Group[1]/Details/Section/Field'):
        name = node.attrib.get('Name')
        value = node.find('Value').text
        out_xml = out_xml.append(pd.Series([value], index=df_cols), ignore_index=True)
    df = pd.DataFrame(np.reshape(out_xml.values, (-1, 4)))
If you need a single DataFrame with all the data, you need to concat each file's DataFrame into one main DataFrame:
from os import listdir, path
import xml.etree.ElementTree as et
import numpy as np
import pandas as pd

mypath = r'C:\testFile'
files = [path.join(mypath, f) for f in listdir(mypath) if f.endswith('.xml')]
mainDF = pd.DataFrame()
for file in files:
    xtree = et.parse(file)
    xroot = xtree.getroot()
    # collect each Field's Value text; DataFrame.append was removed in pandas 2.0,
    # so build a plain list and construct the frame afterwards
    values = []
    for node in xroot.findall('./Group[1]/Details/Section[3]/Subreport/Group/Group[1]/Details/Section/Field'):
        values.append(node.find('Value').text)
    df = pd.DataFrame(np.reshape(values, (-1, 4)))
    mainDF = pd.concat([mainDF, df])
mainDF.to_csv("filename.csv")

How do I apply this code to multiple CSV files?

Could anyone advise me how to apply this code to several CSV files in one folder, then save each modified CSV separately to another folder? In short, I need to automate it.
I need to automatically load a CSV file, execute the code, save the newly modified CSV file, and then repeat for the next CSV file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan

path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path, delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"] = df["time_str"]

def analyza(pozadovane_data):
    new_list = []
    new_df = pd.DataFrame(new_list)
    new_df = df.loc[df["time_str"] == pozadovane_data, ["ta", "tw", "zone", "time_zaokrouhleny"]]
    counter = new_df.ta.count()
    if counter < 24:
        for i in range(counter, 24):
            new_df.loc[i] = [Nan for n in range(4)]
        new_df["ta"] = new_df.ta.fillna(0)
        new_df["tw"] = new_df.tw.fillna(0)
        new_df["zone"] = new_df.zone.fillna(0)
        new_df["time_zaokrouhleny"] = new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
    elif counter > 24:
        counter_list = list(range(24, counter))
        new_df = new_df.drop(new_df.index[counter_list])
    new_df["time_oprava"] = [dt.datetime.combine(d.date(), dt.time(1, 0)) for d in new_df["time_zaokrouhleny"]]
    s = 0
    cas_list = []
    for d in new_df["time_oprava"]:
        d = d + dt.timedelta(hours=s)
        cas_list.append(d)
        s = s + 1
    se = pd.Series(cas_list)
    new_df['time_oprava'] = se.values
    new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
    new_df['Rozdil'] = new_df['ta'] - new_df['tw']
    new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace=True)
    new_df.index = new_df['Cas']
    return new_df

start = dt.datetime(2010, 10, 6)
end = dt.datetime(2010, 12, 27)
date_range = [start + dt.timedelta(days=x) for x in range(0, (end - start).days)]
new_list = []
vysledek_df = pd.DataFrame(new_list)
for d in date_range:
    pom = analyza(d)
    vysledek_df = vysledek_df.append(pom, ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
The simplest way is to use glob. Just set folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand it.
import os
import glob
import pandas as pd

folder_path = 'path/to/folder/'  # path to folder containing the .csv files
output_path = 'path/to/output/folder/'  # path to the output folder

for file in glob.glob(folder_path + '*.csv'):  # only picks up .csv files from the folder
    df = pd.read_csv(file, delimiter=";")  # read the .csv file
    # Do something
    df.to_csv(output_path + 'modified_' + str(os.path.basename(file)), encoding='utf-8', index=False)  # saves the modified .csv to output_path
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
import pandas as pd

def run(file_directory):
    filelist = os.listdir(file_directory)
    for name in filelist:
        # listdir() returns bare names, so join them with the directory
        df = pd.read_csv(os.path.join(file_directory, name), delimiter=";")
        # etc.
        # use a per-file output name so each result isn't overwritten
        df.to_csv(os.path.join(file_directory, 'output_' + name))
If you need to create a new directory, you can use os.mkdir(newpath)
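A minimal sketch of the os.walk() loop mentioned above, with a hypothetical top-level folder name; it calls the run() function defined earlier on every subdirectory that contains CSV files:
import os

root = 'path/to/top/folder'  # hypothetical top-level folder
for dirpath, dirnames, filenames in os.walk(root):
    if any(name.endswith('.csv') for name in filenames):
        run(dirpath)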
Can you still advise on how to parameterize the function?

Pandas Error - Appending multiple files into one

I have a set of files that do not have any extension. They are currently stored in a folder, and the list of them is referenced by the variable "allFiles".
allFiles = glob.glob(base2 + "/*")
I am trying to add an extension to each of the files in allFiles. Add .csv to the file name. I do it using the below code:
for file in allFiles:
    os.rename(os.path.join(base2, file), os.path.join(base2, file + '.csv'))
Next I try to append each of these csv files into one as per the below code.
list_ = []
for file_ in allFiles:
    try:
        df = pd.read_csv(file_, index_col=None, header=None, delim_whitespace=True, error_bad_lines=False)
        list_.append(df)
    except pd.errors.EmptyDataError:
        continue
When I run the above code, I get an error stating one of the files does not exist.
Error : FileNotFoundError: File b'/Users/base2/file1' does not exist
But file1 has now been renamed to file1.csv
Could anyone advise as to where I am going wrong in the above? Thanks
Update:
allFiles = glob.glob(base2 + "/*")
print(allFiles)
list_ = []
print(list_)
allFiles = [x + '.csv' for x in allFiles]
print(allFiles)
for file_ in allFiles:
    try:
        df = pd.read_csv(file_, index_col=None, header=None)
        list_.append(df)
    except pd.errors.EmptyDataError:
        continue
Error : FileNotFoundError: File b'/Users/base2/file1.csv' does not exist
Before running your loop, do:
EDIT for clarity:
for file in allFiles:
    os.rename(os.path.join(base2, file), os.path.join(base2, file + '.csv'))

### What you're adding ###
allFiles = [x + '.csv' for x in allFiles]
##########################

for file_ in allFiles:
    try:
Basically, the problem is that you're changing the file names, but you're not changing the strings in your list to reflect the new file names. You can see this if you print allFiles. The above will make the necessary change for you.
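An alternative is simply to rebuild the list after renaming, so it always reflects what is actually on disk:
# after the rename loop, re-scan the folder instead of patching the old list
allFiles = glob.glob(base2 + "/*.csv")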
