How can I make my code recognize the function defined above - python

I am trying to run through a set of CSV files in order to compile a results CSV file. I'm getting an error that my function is undefined for some reason. Can you tell me why? Thanks.
def response_amp(data):
    import pandas as pd
    import numpy as np
    # putting in the data and cutting out unnecessary parts
    df = pd.read_csv('data.csv', encoding='utf-8')
    df = df[:-1]
    a = df.columns[df.columns.str.startswith('µ')]
    df = df[a]
    dfd = df.drop(df.index[:30])  # dropping the section with no sample
    # splitting the data into chunks so response values can be measured
    df1d = dfd[:320]     # first interval
    df2d = dfd[330:470]  # second interval
    df3d = dfd[480:]     # third interval
    # rolling avg on each
    df1r = df1d.rolling(5, win_type='gaussian').sum(std=4)
    df2r = df2d.rolling(5, win_type='gaussian').sum(std=4)
    df3r = df3d.rolling(5, win_type='gaussian').sum(std=4)
    bsln_1 = df1r.iloc[3:6].mean()
    bsln_2 = df2r.iloc[3:6].mean()
    bsln_3 = df3r.iloc[3:6].mean()
    response_1 = abs(df1r.min() - bsln_1) / bsln_1
    response_2 = abs(df1r.min() - bsln_2) / bsln_2
    response_3 = abs(df1r.min() - bsln_3) / bsln_3
    response = response_1, response_2, response_3
    return response

import os
directory = r'file directory goes here'
response = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        response.append(response_amp(filename))
a = numpy.asarray(response)
numpy.savetxt("ks_response.csv", a, delimiter=",")
Thanks for the help.
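For what it's worth, a few things stand out in the code as posted: the loop passes each filename into response_amp, but the function ignores its argument and always reads the hard-coded 'data.csv'; the driver uses numpy while only np is imported inside the function; and response_2 and response_3 reuse df1r, which looks like a copy-paste slip. A minimal sketch of how the pieces might be wired together, under those assumptions (the directory path is still the question's placeholder):

import os

import numpy as np
import pandas as pd

def response_amp(path):
    # read the file that was actually passed in
    df = pd.read_csv(path, encoding='utf-8')[:-1]
    df = df[df.columns[df.columns.str.startswith('µ')]]
    dfd = df.drop(df.index[:30])  # drop the section with no sample
    responses = []
    for part in (dfd[:320], dfd[330:470], dfd[480:]):
        rolled = part.rolling(5, win_type='gaussian').sum(std=4)
        baseline = rolled.iloc[3:6].mean()
        # assumes a single µ column per file, so each response is a scalar
        responses.append(abs(rolled.min() - baseline) / baseline)
    return tuple(responses)

directory = r'file directory goes here'  # placeholder from the question
results = [response_amp(os.path.join(directory, name))
           for name in os.listdir(directory) if name.endswith('.csv')]
np.savetxt('ks_response.csv', np.asarray(results), delimiter=',')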

Related

Combine Dataframes resulting from a for loop

I need a little help appending the data that's getting generated by the for loop below. Currently, I'm writing it to a DataFrame in the line "df = pd.DataFrame(li_row, columns=col_names)".
But when I have multiple files whose names start with PAJ, I need the resulting DataFrames appended into one DataFrame.
Also, the code below is bits and pieces we gathered and amended to suit our needs, so please excuse me if you feel it's a mess. :)
import xmlschema
import os
import xml.etree.ElementTree as ET
import pandas as pd

dirpath = "C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls"
filenames = os.listdir("C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls")
# print(filenames)
for eachfile in filenames:
    fname = eachfile[0:3]
    print(dirpath + '\\' + eachfile)
    if fname == 'PAJ':
        xmlschema.validate(dirpath + '\\' + eachfile, 'PAJ.xsd')
        tree = ET.parse(eachfile)
        root = tree.getroot()
        # Get alertId from the header
        cols = {}
        for header in root.findall(".//header/alertId"):
            cols[header.tag] = header.text
        # print(cols)
        # get detailHdr to be used for the column header names
        col_names = []
        for DtHeader in root.findall(".//detailHdr/c"):
            col_names.append(DtHeader.text)
        # print(col_names)
        # Get row and c
        li_row = []
        size = 0
        for Data in root.findall(".//report/data"):
            for child in Data:
                # print(child.tag, child.text, len(Data))
                li_row.append([])
                for grandchild in child:
                    # print(grandchild.tag, grandchild.text, len(child))
                    li_row[size].append(grandchild.text)
                size += 1
        # print(li_row)
        # create a dataframe with the col_names and rows, with alertId added at the end
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        print(df)
    elif fname == 'PIE':
        fileContent = ''
        with open(dirpath + '\\' + eachfile) as filehandle:
            fileContent = filehandle.read()
        modFileContent = fileContent.replace("UTF-16", "UTF-8")
        xmlschema.validate(modFileContent, 'PIE.xsd')
So if I were to change your current solution as little as possible, I would create a list of paj_data_frames and concatenate them once the loop is done. Look at the pd.concat documentation: https://pandas.pydata.org/docs/user_guide/merging.html
paj_data_frames = []
for eachfile in filenames:
    ....
    if fname == 'PAJ':
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        paj_data_frames.append(df)
    ....
final_df = pd.concat(paj_data_frames)
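To make the collect-then-concat pattern concrete, here is a self-contained toy version (dummy frames standing in for the per-file DataFrames built from the XML):

import pandas as pd

paj_data_frames = []
for i in range(3):
    # stand-in for the frame built from each PAJ file
    paj_data_frames.append(pd.DataFrame({'alertId': [str(i)], 'value': [i * 10]}))

# ignore_index=True renumbers the rows of the combined frame
final_df = pd.concat(paj_data_frames, ignore_index=True)
print(final_df)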

Looping through a folder in python returning blank dataframe

I am trying to grab all the files located in the folder, but for some reason the code is returning a blank data frame. Below is my code:
import glob
import pandas as pd

def fwd_pick(p):
    final2 = pd.DataFrame()
    excel_files2 = glob.glob(p + '.xlsx')
    for f in excel_files2:
        df2 = pd.read_excel(f)
        final2 = final2.append(df2)
    return final2

# Getting the forward pick locations
fwdpick = fwd_pick('C:/Users/abc/newfolder/')
Any help will be appreciated.
I realized that I had missed the * in glob.glob; that's why it was not picking up the files. It works fine now:
import glob
import pandas as pd

def fwd_pick(p):
    final2 = pd.DataFrame()
    excel_files2 = glob.glob(p + '*.xlsx')
    for f in excel_files2:
        df2 = pd.read_excel(f)
        final2 = final2.append(df2)
    return final2

# Getting the forward pick locations
fwdpick = fwd_pick('C:/Users/abc/newfolder/')
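One caveat worth adding: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current versions this function fails with an AttributeError. A sketch of the equivalent using pd.concat (same placeholder path as above):

import glob
import pandas as pd

def fwd_pick(p):
    excel_files2 = glob.glob(p + '*.xlsx')
    # collect the frames and concatenate once, instead of appending in the loop
    frames = [pd.read_excel(f) for f in excel_files2]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

fwdpick = fwd_pick('C:/Users/abc/newfolder/')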

Unable to convert pandas dataframe to csv file

I want to create a dataset.csv file from the various raw files within my input_path. However, my code doesn't seem to generate the CSV file.
import os
import pandas as pd
from zipfile import ZipFile
import cv2
import re
import csv
from sklearn.dummy import DummyClassifier

input_path = "../input_data/"

class RawToCSV:
    def __init__(self, path_, df):
        self.df = df
        self.measurement_df = None
        self.cls = None
        self.path_ = path_

    def raw_file_processing(self, path_):
        # Open all the subfolders within path
        for root, dirs, files in os.walk(path_):
            for file in files:
                with open(os.path.join(root, file), "r") as data:
                    df = pd.read_csv(data)
                    # Create the ID series by concatenating columns 1-3
                    df = df.assign(ID=df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                        lambda row: '_'.join([str(each) for each in row]), axis=1))
                    df = df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
                    # The class info is the tile_num (i.e., column 3) - TODO: check if class info is tile_num
                    cls_col = df.iloc[2]
                    # Dummy-code the classes
                    cls = pd.get_dummies(cls_col)
                    # Obtain measurement info
                    # Normalize data against blank/empty columns
                    # log-transform the data
                    for col in df[9:]:
                        if re.search(r"(Blank|Empty)$", col):
                            background = col
                        else:
                            line = col.readline()
                            for data in line:
                                norm_data = data / background
                                self.measurement_df = np.log2(norm_data)
        return self.df["ID"], cls, self.measurement_df

    def dataset_csv(self):
        """Col 1: ID
        Col 2: class
        Col 3-n: measurements"""
        ids = self.df["ID"]
        id_col = ids.to_frame()
        cls_col = self.cls.to_frame()
        frames = [id_col, cls_col, self.measurement_df]
        dataset_df = pd.concat(frames)
        data_csv = dataset_df.to_csv(r"../input_data/dataset.csv")
        return data_csv

dataset = RawToCSV.dataset_csv(input_path)
Desired output dataset.csv:

ID        Class   measurement_A   measurement_B
Sample1   value   value           value
Sample2   value   value           value
Not sure if you need to treat your file path as a raw string. You should simply try:
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
PS: as a test, start by saving your CSV in the same folder where your script is:
data_csv = dataset_df.to_csv("dataset.csv")

Is there a faster, more efficient way to process my data?

Further to a post I made a couple of weeks ago, I'm reading rows from a spreadsheet (nearly 215,000) and attempting to match them with text files contained in a sub-directory. On average the sub-directory contains about 14,000 text files. Although my code is working, it is taking an inordinate amount of time to copy the matched files to a second sub-directory. At this rate it will be the end of August before the job is complete (average processing time is six hours).
Is there a way to improve the efficiency of this algorithm, or indeed is there a better way? My code is below.
Regards
import glob
import os,sys
import csv
import shutil
import pandas as pd
import fnmatch
import string
import xlrd
from os import listdir
from os.path import isfile
MDA_Path = 'D:/1994_QTR3' # contains Loughram and MacDonald 10-K files for QTR3
MDA_Path_2 = 'D:/1994_QTR4' # Contains L&M 10-K files for QTR4
MDA_Path_3 = 'D:/1995_QTR1'
MDA_Path_4 = 'D:/1995_QTR2'
MDA_Path_5 = 'D:/1995_QTR3'
MDA_Path_6 = 'D:/1995_QTR4'
MDA_Path_7 = 'D:/1996_QTR1'
MDA_Path_8 = 'D:/1996_QTR2'
MDA_Path_9 = 'D:/1996_QTR3'
MDA_Path_10 = 'D:/1996_QTR4'
MDA_Path_11 = 'D:/1997_QTR1'
MDA_Path_12 = 'D:/1997_QTR2'
MDA_Path_13 = 'D:/1997_QTR3'
MDA_Path_14 = 'D:/1997_QTR4'
MDA_Path_15 = 'D:/1998/QTR1'
MDA_Path_16 = 'D:/1998/QTR2'
MDA_Path_17 = 'D:/1998/QTR3'
MDA_Path_18 = 'D:/1998/QTR4'
MDA_Path_19 = 'D:/1999/QTR1'
MDA_Path_20 = 'D:/1999/QTR2'
MDA_Path_21 = 'D:/1999/QTR3'
MDA_Path_22 = 'D:/1999/QTR4'
MDA_Path_23 = 'D:/2000/QTR1'
MDA_Path_24 = 'D:/2000/QTR2'
MDA_Path_25 = 'D:/2000/QTR3'
MDA_Path_26 = 'D:/2000/QTR4'
MDA_Path_27 = 'D:/2001/QTR1'
MDA_Path_28 = 'D:/2001/QTR2'
MDA_Path_29 = 'D:/2001/QTR3'
MDA_Path_30 = 'D:/2001/QTR4'
MDA_Path_31 = 'D:/2002/QTR1'
MDA_Path_32 = 'D:/2002/QTR2'
MDA_Path_33 = 'D:/2002/QTR3'
MDA_Path_34 = 'D:/2002/QTR4'
MDA_Target_List = r'D:/PhD_Data/Wenrui_Filing_list' # stores wenruis data
MDA_For_Parsing_1994_QTR3 = 'D:/Required_MDA_1994_QTR3' # will hold all 10-Ks from wenrui's spreadsheet once detected
MDA_For_Parsing_1994_QTR4 = 'D:/Required_MDA_1994_QTR4'
MDA_For_Parsing_1995_QTR1 = 'D:/Required_MDA_1995_QTR1'
MDA_For_Parsing_1995_QTR2 = 'D:/Required_MDA_1995_QTR2'
MDA_For_Parsing_1995_QTR3 = 'D:/Required_MDA_1995_QTR3'
MDA_For_Parsing_1995_QTR4 = 'D:/Required_MDA_1995_QTR4'
MDA_For_Parsing_1996_QTR1 = 'D:/Required_MDA_1996_QTR1'
MDA_For_Parsing_1996_QTR2 = 'D:/Required_MDA_1996_QTR2'
MDA_For_Parsing_1996_QTR3 = 'D:/Required_MDA_1996_QTR3'
MDA_For_Parsing_1996_QTR4 = 'D:/Required_MDA_1996_QTR4'
MDA_For_Parsing_1997_QTR1 = 'D:/Required_MDA_1997_QTR1'
MDA_For_Parsing_1997_QTR2 = 'D:/Required_MDA_1997_QTR2'
MDA_For_Parsing_1997_QTR3 = 'D:/Required_MDA_1997_QTR3'
MDA_For_Parsing_1997_QTR4 = 'D:/Required_MDA_1997_QTR4'
MDA_For_Parsing_1998_QTR1 = 'D:/Required_MDA_1998_QTR1'
MDA_For_Parsing_1998_QTR2 = 'D:/Required_MDA_1998_QTR2'
MDA_For_Parsing_1998_QTR3 = 'D:/Required_MDA_1998_QTR3'
MDA_For_Parsing_1998_QTR4 = 'D:/Required_MDA_1998_QTR4'
MDA_For_Parsing_1999_QTR1 = 'D:/Required_MDA_1999_QTR1'
MDA_For_Parsing_1999_QTR2 = 'D:/Required_MDA_1999_QTR2'
MDA_For_Parsing_1999_QTR3 = 'D:/Required_MDA_1999_QTR3'
MDA_For_Parsing_1999_QTR4 = 'D:/Required_MDA_1999_QTR4'
MDA_For_Parsing_2000_QTR1 = 'D:/Required_MDA_2000_QTR1'
MDA_For_Parsing_2000_QTR2 = 'D:/Required_MDA_2000_QTR2'
MDA_For_Parsing_2000_QTR3 = 'D:/Required_MDA_2000_QTR3'
MDA_For_Parsing_2000_QTR4 = 'D:/Required_MDA_2000_QTR4'
MDA_For_Parsing_2001_QTR1 = 'D:/Required_MDA_2001_QTR1'
MDA_For_Parsing_2001_QTR2 = 'D:/Required_MDA_2001_QTR2'
MDA_For_Parsing_2001_QTR3 = 'D:/Required_MDA_2001_QTR3'
MDA_For_Parsing_2001_QTR4 = 'D:/Required_MDA_2001_QTR4'
MDA_FOR_Parsing_2002_QTR1 = 'D:/Required_MDA_2002_QTR1'
MDA_FOR_Parsing_2002_QTR2 = 'D:/Required_MDA_2002_QTR2'
MDA_FOR_Parsing_2002_QTR3 = 'D:/Required_MDA_2002_QTR3'
MDA_FOR_Parsing_2002_QTR4 = 'D:/Required_MDA_2002_QTR4'
# open the csv file and extract the column containing the location of the text file(s)
datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
df = pd.DataFrame(datas, columns=['FILE_NAME'])  # extract the data contained in the FILE_NAME column
df['FILE_NAME'] = df['FILE_NAME'].str[26:]  # remove the first 26 characters, which contain the EDGAR drive info
df['FILE_NAME'] = df['FILE_NAME'].str.strip()  # remove all leading and trailing whitespace
file_length = len(df)  # count the number of files in Wenrui's list (needed later to loop through all occurrences)
dirs = os.listdir(MDA_Path_32)
# dirs1 = os.listdir(MDA_Path_3)
for x in range(file_length):
    for file in dirs:
        # if file == df['FILE_NAME'][x]:
        if df['FILE_NAME'][x] in file:
            print(file)
            shutil.copy(MDA_Path_32 + '/' + file, MDA_FOR_Parsing_2002_QTR2)  # Move it to QTR directory
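No answer is attached here, but the slow part is visible in the loop: for every one of the ~215,000 spreadsheet rows it rescans all ~14,000 directory entries and prints each match, which is O(rows × files). A sketch of one common fix, assuming (as the substring test suggests) that each FILE_NAME entry is the target file's name without its extension, so exact lookups in a set can replace the inner scan:

import os
import shutil

import pandas as pd

datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
# build the lookup once; set membership tests are O(1)
wanted = set(datas['FILE_NAME'].str[26:].str.strip())

src = MDA_Path_32                 # source directory from the question's script
dst = MDA_FOR_Parsing_2002_QTR2   # destination directory from the question's script

for file in os.listdir(src):      # single pass over the directory
    if os.path.splitext(file)[0] in wanted:
        shutil.copy(os.path.join(src, file), dst)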

How do I apply this code to multiple csv?

Could anyone advise me how to apply this code to several CSVs in one folder, and then save each modified CSV separately to another folder? In short, I need to automate it.
I need to automatically load a CSV file, execute the code, save the newly modified CSV file, and then repeat with the next CSV file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan

path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path, delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"] = df["time_str"]

def analyza(pozadovane_data):
    new_list = []
    new_df = pd.DataFrame(new_list)
    new_df = df.loc[df["time_str"] == pozadovane_data, ["ta", "tw", "zone", "time_zaokrouhleny"]]
    counter = new_df.ta.count()
    if counter < 24:
        for i in range(counter, 24):
            new_df.loc[i] = [Nan for n in range(4)]
        new_df["ta"] = new_df.ta.fillna(0)
        new_df["tw"] = new_df.tw.fillna(0)
        new_df["zone"] = new_df.zone.fillna(0)
        new_df["time_zaokrouhleny"] = new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
    elif counter > 24:
        counter_list = list(range(24, counter))
        new_df = new_df.drop(new_df.index[counter_list])
    new_df["time_oprava"] = [dt.datetime.combine(d.date(), dt.time(1, 0)) for d in new_df["time_zaokrouhleny"]]
    s = 0
    cas_list = []
    for d in new_df["time_oprava"]:
        d = d + dt.timedelta(hours=s)
        # print(d)
        # print(s)
        cas_list.append(d)
        s = s + 1
    se = pd.Series(cas_list)
    new_df['time_oprava'] = se.values
    new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
    new_df['Rozdil'] = new_df['ta'] - new_df['tw']
    new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace=True)
    new_df.index = new_df['Cas']
    return new_df

start = dt.datetime(2010, 10, 6)
end = dt.datetime(2010, 12, 27)
date_range = []
date_range = [start + dt.timedelta(days=x) for x in range(0, (end - start).days)]
new_list = []
vysledek_df = pd.DataFrame(new_list)
for d in date_range:
    pom = analyza(d)
    vysledek_df = vysledek_df.append(pom, ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
The simplest way is to use glob. Just set folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand it.
import os
import glob
import pandas as pd

folder_path = 'path/to/folder/'         # path to the folder containing the .csv files
output_path = 'path/to/output/folder/'  # path to the output folder

for file in glob.glob(folder_path + '*.csv'):  # only loads .csv files from the folder
    df = pd.read_csv(file, delimiter=";")      # read the .csv file
    # Do something
    df.to_csv(output_path + 'modified_' + str(os.path.basename(file)),
              encoding='utf-8', index=False)   # save the modified .csv file to output_path
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
import pandas as pd

def run(file_directory):
    filelist = os.listdir(file_directory)
    for path in filelist:
        # os.listdir returns bare file names, so join them back onto the directory
        df = pd.read_csv(os.path.join(file_directory, path), delimiter=";")
        # etc.
        df.to_csv(os.path.join(file_directory, 'output.csv'))
If you need to create a new directory, you can use os.mkdir(newpath)
Can you still advise on how to parameterize the function?
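No reply to that comment appears here, but "parameterize" presumably just means turning the hard-coded paths into function arguments. A minimal sketch under that assumption, reusing the paths from the question (process_folder and its arguments are illustrative names, not from the original answers):

import glob
import os

import pandas as pd

def process_folder(input_dir, output_dir, delimiter=';'):
    # create the output folder if it does not exist yet
    os.makedirs(output_dir, exist_ok=True)
    for file in glob.glob(os.path.join(input_dir, '*.csv')):
        df = pd.read_csv(file, delimiter=delimiter)
        # ... apply the per-file analysis from the question here ...
        df.to_csv(os.path.join(output_dir, os.path.basename(file)),
                  encoding='utf-8', index=False)

process_folder('C://Users//Zemi4//Desktop//csv', 'C://Users//Zemi4//Desktop//zpr')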
