Further to a post I made a couple of weeks ago, I'm reading rows from a spreadsheet (nearly 215,000) and attempting to match them with text files contained in a sub-directory. On average the sub-directory contains about 14,000 text files. Although my code works, it is taking an inordinate amount of time to copy the matched files to a second sub-directory; at this rate it will be the end of August before the job is complete (average processing time is six hours).
Is there a way to improve the efficiency of this algorithm, or is there a better way altogether? My code is below.
Regards
import glob
import os,sys
import csv
import shutil
import pandas as pd
import fnmatch
import string
import xlrd
from os import listdir
from os.path import isfile
MDA_Path = 'D:/1994_QTR3' # contains Loughram and MacDonald 10-K files for QTR3
MDA_Path_2 = 'D:/1994_QTR4' # Contains L&M 10-K files for QTR4
MDA_Path_3 = 'D:/1995_QTR1'
MDA_Path_4 = 'D:/1995_QTR2'
MDA_Path_5 = 'D:/1995_QTR3'
MDA_Path_6 = 'D:/1995_QTR4'
MDA_Path_7 = 'D:/1996_QTR1'
MDA_Path_8 = 'D:/1996_QTR2'
MDA_Path_9 = 'D:/1996_QTR3'
MDA_Path_10 = 'D:/1996_QTR4'
MDA_Path_11 = 'D:/1997_QTR1'
MDA_Path_12 = 'D:/1997_QTR2'
MDA_Path_13 = 'D:/1997_QTR3'
MDA_Path_14 = 'D:/1997_QTR4'
MDA_Path_15 = 'D:/1998/QTR1'
MDA_Path_16 = 'D:/1998/QTR2'
MDA_Path_17 = 'D:/1998/QTR3'
MDA_Path_18 = 'D:/1998/QTR4'
MDA_Path_19 = 'D:/1999/QTR1'
MDA_Path_20 = 'D:/1999/QTR2'
MDA_Path_21 = 'D:/1999/QTR3'
MDA_Path_22 = 'D:/1999/QTR4'
MDA_Path_23 = 'D:/2000/QTR1'
MDA_Path_24 = 'D:/2000/QTR2'
MDA_Path_25 = 'D:/2000/QTR3'
MDA_Path_26 = 'D:/2000/QTR4'
MDA_Path_27 = 'D:/2001/QTR1'
MDA_Path_28 = 'D:/2001/QTR2'
MDA_Path_29 = 'D:/2001/QTR3'
MDA_Path_30 = 'D:/2001/QTR4'
MDA_Path_31 = 'D:/2002/QTR1'
MDA_Path_32 = 'D:/2002/QTR2'
MDA_Path_33 = 'D:/2002/QTR3'
MDA_Path_34 = 'D:/2002/QTR4'
MDA_Target_List = r'D:/PhD_Data/Wenrui_Filing_list' # stores Wenrui's data
MDA_For_Parsing_1994_QTR3 = 'D:/Required_MDA_1994_QTR3' # will hold all 10-Ks from Wenrui's spreadsheet once detected
MDA_For_Parsing_1994_QTR4 = 'D:/Required_MDA_1994_QTR4'
MDA_For_Parsing_1995_QTR1 = 'D:/Required_MDA_1995_QTR1'
MDA_For_Parsing_1995_QTR2 = 'D:/Required_MDA_1995_QTR2'
MDA_For_Parsing_1995_QTR3 = 'D:/Required_MDA_1995_QTR3'
MDA_For_Parsing_1995_QTR4 = 'D:/Required_MDA_1995_QTR4'
MDA_For_Parsing_1996_QTR1 = 'D:/Required_MDA_1996_QTR1'
MDA_For_Parsing_1996_QTR2 = 'D:/Required_MDA_1996_QTR2'
MDA_For_Parsing_1996_QTR3 = 'D:/Required_MDA_1996_QTR3'
MDA_For_Parsing_1996_QTR4 = 'D:/Required_MDA_1996_QTR4'
MDA_For_Parsing_1997_QTR1 = 'D:/Required_MDA_1997_QTR1'
MDA_For_Parsing_1997_QTR2 = 'D:/Required_MDA_1997_QTR2'
MDA_For_Parsing_1997_QTR3 = 'D:/Required_MDA_1997_QTR3'
MDA_For_Parsing_1997_QTR4 = 'D:/Required_MDA_1997_QTR4'
MDA_For_Parsing_1998_QTR1 = 'D:/Required_MDA_1998_QTR1'
MDA_For_Parsing_1998_QTR2 = 'D:/Required_MDA_1998_QTR2'
MDA_For_Parsing_1998_QTR3 = 'D:/Required_MDA_1998_QTR3'
MDA_For_Parsing_1998_QTR4 = 'D:/Required_MDA_1998_QTR4'
MDA_For_Parsing_1999_QTR1 = 'D:/Required_MDA_1999_QTR1'
MDA_For_Parsing_1999_QTR2 = 'D:/Required_MDA_1999_QTR2'
MDA_For_Parsing_1999_QTR3 = 'D:/Required_MDA_1999_QTR3'
MDA_For_Parsing_1999_QTR4 = 'D:/Required_MDA_1999_QTR4'
MDA_For_Parsing_2000_QTR1 = 'D:/Required_MDA_2000_QTR1'
MDA_For_Parsing_2000_QTR2 = 'D:/Required_MDA_2000_QTR2'
MDA_For_Parsing_2000_QTR3 = 'D:/Required_MDA_2000_QTR3'
MDA_For_Parsing_2000_QTR4 = 'D:/Required_MDA_2000_QTR4'
MDA_For_Parsing_2001_QTR1 = 'D:/Required_MDA_2001_QTR1'
MDA_For_Parsing_2001_QTR2 = 'D:/Required_MDA_2001_QTR2'
MDA_For_Parsing_2001_QTR3 = 'D:/Required_MDA_2001_QTR3'
MDA_For_Parsing_2001_QTR4 = 'D:/Required_MDA_2001_QTR4'
MDA_FOR_Parsing_2002_QTR1 = 'D:/Required_MDA_2002_QTR1'
MDA_FOR_Parsing_2002_QTR2 = 'D:/Required_MDA_2002_QTR2'
MDA_FOR_Parsing_2002_QTR3 = 'D:/Required_MDA_2002_QTR3'
MDA_FOR_Parsing_2002_QTR4 = 'D:/Required_MDA_2002_QTR4'
# open the Excel file and extract the column containing the location of the text file(s)
datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
df = pd.DataFrame(datas, columns = ['FILE_NAME']) # extract the data contained in FILE_NAME column
df['FILE_NAME'] = df['FILE_NAME'].str[26:] # remove the first 26 characters which contain the edgar drive info
df['FILE_NAME'] = df['FILE_NAME'].str.strip() # remove all leading and trailing whitespace
file_length = len(df) # count number of files in Wenrui's list (will need this later to loop through all occurrences)
dirs = os.listdir(MDA_Path_32)
# dirs1 = os.listdir(MDA_Path_3)
for x in range(file_length):
    for file in dirs:
        # if file == df['FILE_NAME'][x]:
        if df['FILE_NAME'][x] in file:
            print(file)
            shutil.copy(MDA_Path_32 + '/' + file, MDA_FOR_Parsing_2002_QTR2) # copy it to the QTR directory
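For comparison, here is a minimal sketch of a set-based lookup that replaces the nested loop (roughly 215,000 × 14,000 substring checks per quarter directory). It assumes the cleaned FILE_NAME values are complete file names; if they are only fragments, the same idea would need a different kind of index.

import os
import shutil
import pandas as pd

# build the set of wanted names once: membership tests are then O(1)
datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
wanted = set(datas['FILE_NAME'].str[26:].str.strip())

# single pass over the ~14,000 files in the quarter directory defined above
for file in os.listdir(MDA_Path_32):
    if file in wanted:
        shutil.copy(os.path.join(MDA_Path_32, file), MDA_FOR_Parsing_2002_QTR2)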
Related
I'm trying to make a project where I create a graph from a Python project.
I have this code:
import os
import sys
import re
import networkx as nx
from pyvis.physics import Physics
from radon.visitors import ComplexityVisitor
from pyvis.network import Network
rootDir ="/home/ask/Git/Zeeguu-API"
depth = int(sys.argv[1])
class directory:
    def __init__(self, path, ParentDir=None, ChildrenDirs=[], PyChildren=[]) -> None:
        self.path = path
        self.parentDir = ParentDir
        self.childrenDirs = ChildrenDirs  # child sub-directories
        self.pyChildren = PyChildren      # python files directly inside this directory
def getComplexityoffile(file: str):
    with open(file, "r") as f:
        s = f.read()
    return ComplexityVisitor.from_code(s).total_complexity
def getParentOfDir(dir: str):
cutlast = dir.split("/")[:-1]
join = "/".join(cutlast)
if join:
return join
else:
return "./"
def extract_importandClass_from_line(unline):
    x = re.search(r"^import (\S+)", unline)
    if x is None:
        x = re.search(r"^from (\S+)", unline)
    return x.group(1)  # raises for non-import lines; the caller skips those
def getimportsforfile(file):
    all_imports = []
    with open(file) as fh:
        for line in fh:
            try:
                imports = extract_importandClass_from_line(line)
                importsFormatted = imports.replace('.', '/')
                # strip a leading slash so the path matches the display-directory keys
                finalimport = importsFormatted[1:] if importsFormatted.startswith('/') else importsFormatted
                all_imports.append(finalimport)
            except Exception:
                continue
    return all_imports
NodesAndComplexity = {}  # (node -> accumulated complexity of that folder)
# bookkeeping used to keep track of dependencies
Map_Dirs_And_Files_To_Displaybledirs = {}
pythonFile_to_imports = {}  # (file doing the importing -> files/dirs it imports)
dirsForDisplay = set()
# mapping files to parent directories
parenDirToChildDir = {}  # (parent, [list of children])
G = nx.DiGraph()
isRoot = True
for root, dirs, files in os.walk(rootDir):
pyfiles = list(filter(lambda a : a.endswith('.py'), files))
thisDir = root.replace(rootDir, '')
splitDIR = thisDir[1:].split("/")[:depth]
if not isRoot:
displayableDir = "/" + "/".join(splitDIR)
else:
displayableDir = "./"
isRoot = False
# if there are python files in this directory
referentialDIr = thisDir[1:] if thisDir.startswith('/') else thisDir
Map_Dirs_And_Files_To_Displaybledirs[referentialDIr] = displayableDir
if (pyfiles):
accumulateComplexity = 0
for f in pyfiles:
filepath = root + "/"+ f
imports = getimportsforfile(filepath)
logFile = thisDir + "/" + f[:-3]
accumulateComplexity = accumulateComplexity + getComplexityoffile(filepath)
removedslashFromLogfile = logFile[1:] if logFile.startswith('/') else logFile
Map_Dirs_And_Files_To_Displaybledirs[removedslashFromLogfile] = displayableDir
pythonFile_to_imports[removedslashFromLogfile] = imports
if displayableDir not in NodesAndComplexity:
NodesAndComplexity[displayableDir] = accumulateComplexity
else:
NodesAndComplexity[displayableDir] = NodesAndComplexity[displayableDir] + accumulateComplexity
if (displayableDir not in dirsForDisplay):
dirsForDisplay.add(thisDir)
G.add_node(displayableDir, Physics=False)
if not isRoot and displayableDir != "./":
parent = getParentOfDir(displayableDir)
G.add_edge(parent, displayableDir)
# setting node sizes
for importingfile, importlist in pythonFile_to_imports.items():
for importt in importlist:
if importt in Map_Dirs_And_Files_To_Displaybledirs:
fromf = Map_Dirs_And_Files_To_Displaybledirs[importingfile]
to = Map_Dirs_And_Files_To_Displaybledirs[importt]
if fromf != to:
G.add_edge(Map_Dirs_And_Files_To_Displaybledirs[importingfile],Map_Dirs_And_Files_To_Displaybledirs[importt], color="red")
for node, complexity in NodesAndComplexity.items():
complexixtyDisplay = complexity / 2
G.nodes[node]["size"] = complexixtyDisplay
Displayer = Network(directed=True, height="1500px", width="100%")
Displayer.from_nx(G)
Displayer.barnes_hut(overlap=1)
Displayer.show_buttons(filter_=["physics"])
Displayer.show("pik.html")
This creates the graph just fine. However, when I create it, the graph flies around my screen and it is impossible to actually get a look at it.
If I remove Displayer.barnes_hut(overlap=1), it doesn't move, but then the nodes are all bunched up on top of each other, and again the graph is impossible to decipher.
How do I get a graph that both stays (reasonably) still and is readable?
In the show_buttons call, add all the buttons; then, after creating the pik.html file, open it in Google Chrome. Among the button options there is a physics section where you can disable the physics option.
From then on the nodes will not move, and you can arrange them as you want by dragging them.
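If you would rather fix this in code, one alternative (a sketch, not part of the answer above) is to compute a static layout with networkx and hand the coordinates to pyvis with physics switched off per node, so the nodes neither drift nor pile up. nx.karate_club_graph() is just a stand-in for the graph G built in the question.

import networkx as nx
from pyvis.network import Network

G = nx.karate_club_graph()                     # stand-in graph for illustration
pos = nx.spring_layout(G, scale=1000)          # precomputed, fixed coordinates

net = Network(directed=True, height="1500px", width="100%")
net.from_nx(G)
for node_id, (x, y) in pos.items():
    node = net.get_node(node_id)
    node["x"], node["y"] = float(x), float(y)  # pin the node at its precomputed position
    node["physics"] = False                    # stop the simulation from moving it
net.show("pik.html")

A larger scale value spreads the nodes further apart if the graph is dense.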
I am trying to copy the data from multiple Excel files into one Excel file. I am a novice with Python and openpyxl, so I have opened each file and copied it row by row. I want to do this with multiple files. How do I loop through the rows and columns and copy the data, given that the columns are in the same order in all the files?
import openpyxl as xl
from openpyxl import workbook
incident_wb = xl.load_workbook('incident resolved yesterday.xlsx')
incident_sheet = incident_wb['Page 1']
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"
combined_wb.save('combined_sheet.xlsx')
for row in range(1, incident_sheet.max_row+1):
incident_no = incident_sheet.cell(row,1)
opened_date = incident_sheet.cell(row,2)
shrt_desc = incident_sheet.cell(row,3)
requester = incident_sheet.cell(row,4)
incdnt_type = incident_sheet.cell(row,5)
priority = incident_sheet.cell(row,6)
assgn_grp = incident_sheet.cell(row,7)
assgn_to = incident_sheet.cell(row,8)
updated = incident_sheet.cell(row,9)
status = incident_sheet.cell(row,10)
sub_status = incident_sheet.cell(row,11)
##copy the data into the new sheet
incident_no_1 = combined_sheet.cell(row,1)
incident_no_1.value = incident_no.value
opened_date_1 = combined_sheet.cell(row,2)
opened_date_1.value = opened_date.value
shrt_desc_1 = combined_sheet.cell(row,3)
shrt_desc_1.value = shrt_desc.value
requester_1 = combined_sheet.cell(row,4)
requester_1.value = requester.value
incdnt_type_1 = combined_sheet.cell(row,5)
incdnt_type_1.value = incdnt_type.value
priority_1 = combined_sheet.cell(row,6)
priority_1.value = priority.value
assgn_grp_1 = combined_sheet.cell(row,7)
assgn_grp_1.value = assgn_grp.value
assgn_to_1 = combined_sheet.cell(row,8)
assgn_to_1.value = assgn_to.value
updated_1 = combined_sheet.cell(row,9)
updated_1.value = updated.value
status_1 = combined_sheet.cell(row,10)
status_1.value = status.value
sub_status_1 = combined_sheet.cell(row,11)
sub_status_1.value = sub_status.value
##print(f"The incident resolved yesterday {incident_no.value}")
combined_wb.save('combined_sheet.xlsx')
An alternative approach would be to build a list of data from multiple Excel files and then write it to another file.
As a proof of concept:
import openpyxl as xl
from openpyxl import workbook
def provide_data(workbookName, sheetName):
wb = xl.load_workbook(workbookName)
sheet = wb[sheetName]
return [[y.value for y in x] for x in sheet.iter_rows()]
# This creates an array of rows, each of which contains an array of cell values.
# It would be better to provide a mapping for the cells and return a business object.
def save_data(list_of_sheets):
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"
for sheet in list_of_sheets:
for row in sheet:
combined_sheet.append(row) # combining multiple rows.
combined_wb.save('combined_sheet.xlsx')
workSheetsToCopy = [['incident resolved yesterday.xlsx', 'Page 1'], ['other.xlsx', 'Page 1']]
workSheetsToCopy = [provide_data(x[0], x[1]) for x in workSheetsToCopy]
save_data(workSheetsToCopy)
I am trying to run through a set of CSV files in order to compile a results CSV file. I'm getting an error that my function is undefined for some reason. Can you tell me why? Thanks.
def response_amp(data):
import pandas as pd
import numpy as np
#putting in and cutting out unnecessary parts of the data
df = pd.read_csv(data, encoding = 'utf-8') # read the file that was passed in
df = df[:-1]
a = df.columns[df.columns.str.startswith('µ')]
df = df[a]
dfd = df.drop(df.index[:30]) #dropping the section with no sample
#splitting the data into chunks so response values can be measure
df1d = dfd[:320] #first interval
df2d = dfd[330:470] #second interval
df3d = dfd[480:] #third interval
#rolling avg on each
df1r = df1d.rolling(5, win_type='gaussian').sum(std=4)
df2r = df2d.rolling(5, win_type='gaussian').sum(std=4)
df3r = df3d.rolling(5, win_type='gaussian').sum(std=4)
bsln_1 = df1r.iloc[3:6].mean()
bsln_2 = df2r.iloc[3:6].mean()
bsln_3 = df3r.iloc[3:6].mean()
response_1 = abs(df1r.min()-bsln_1)/bsln_1
response_2 = abs(df2r.min()-bsln_2)/bsln_2
response_3 = abs(df3r.min()-bsln_3)/bsln_3
response = response_1,response_2,response_3
return(response)
import os
import numpy as np

directory = (r'file directory goes here')
response = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        response.append(response_amp(os.path.join(directory, filename)))  # pass the full path to the function
a = np.asarray(response)
np.savetxt("ks_response.csv", a, delimiter=",")
Thanks for the help.
Could anyone advise me how to apply this code to several CSV files in one folder, and then save each modified CSV separately to another folder? In short, I need to automate it.
I need to automatically load a CSV file, run the code, save the newly modified CSV file, and then repeat for the next CSV file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path,delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"]=df["time_str"]
def analyza(pozadovane_data):
new_list = []
new_df = pd.DataFrame(new_list)
new_df=df.loc[df["time_str"] == pozadovane_data,["ta","tw", "zone", "time_zaokrouhleny"]]
counter = new_df.ta.count()
if counter < 24:
for i in range(counter,24):
new_df.loc[i] = [Nan for n in range(4)]
new_df["ta"]= new_df.ta.fillna(0)
new_df["tw"] = new_df.tw.fillna(0)
new_df["zone"] = new_df.zone.fillna(0)
new_df["time_zaokrouhleny"]=new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
elif counter > 24:
counter_list = list(range(24,counter))
new_df = new_df.drop(new_df.index[counter_list])
new_df["time_oprava"] = [dt.datetime.combine(d.date(),dt.time(1,0)) for d in new_df["time_zaokrouhleny"]]
s = 0
cas_list = []
for d in new_df["time_oprava"]:
d =d + dt.timedelta(hours=s)
#print(d)
#print(s)
cas_list.append(d)
s = s + 1
se = pd.Series(cas_list)
new_df['time_oprava'] = se.values
new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
new_df['Rozdil'] = new_df['ta'] - new_df['tw']
new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace = True)
new_df.index = new_df['Cas']
return new_df
start = dt.datetime(2010,10,6)
end = dt.datetime(2010,12,27)
date_range = []
date_range = [start + dt.timedelta(days=x) for x in range(0,(end-start).days)]
new_list = []
vysledek_df =pd.DataFrame(new_list)
for d in date_range:
pom = analyza(d)
vysledek_df = vysledek_df.append(pom,ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
The simplest way is to use glob. Just set folder_path and output_path as per your requirements and use the sample code below. I have commented the code to help you understand it.
import os
import glob
import pandas as pd

folder_path = 'path/to/folder/'         # path to folder containing .csv files
output_path = 'path/to/output/folder/'  # path to output folder

for file in glob.glob(folder_path + '*.csv'):  # only loads .csv files from the folder
    df = pd.read_csv(file, delimiter=";")      # read .csv file
    # Do something
    df.to_csv(output_path + 'modified_' + str(os.path.basename(file)), encoding='utf-8', index=False)  # saves the modified .csv file to output_path
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
import pandas as pd

def run(file_directory):
    filelist = os.listdir(file_directory)
    for path in filelist:
        df = pd.read_csv(os.path.join(file_directory, path), delimiter=";")  # build the full path to each file
        # etc.
        df.to_csv(os.path.join(file_directory, 'output.csv'))

If you need to create a new directory, you can use os.mkdir(newpath).
Can you still advise on how to parameterize the function?
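One way to parameterize it (a sketch that wraps the approach above into a function taking the input and output folders as arguments; the transformation itself would be the code from the question):

import os
import pandas as pd

def run(input_directory, output_directory):
    os.makedirs(output_directory, exist_ok=True)  # create the output folder if it does not exist
    for filename in os.listdir(input_directory):
        if not filename.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(input_directory, filename), delimiter=";")
        # ... apply the transformation from the question to df here ...
        df.to_csv(os.path.join(output_directory, filename), encoding='utf-8', index=False)

# example call with the folders from the question
run('C://Users//Zemi4//Desktop//csv', 'C://Users//Zemi4//Desktop//zpr')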
I am trying to merge 1000+ PDF pages, and it works with under 750 pages. If I open more than 750 it processes them, but the output file is 0 bytes.
from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os
import sys
from collections import OrderedDict
import win32file
win32file._setmaxstdio(8192)
print(win32file._getmaxstdio())
sys.setrecursionlimit(30000)
nameOfFile = os.path.basename(os.getcwd())
#get page number
def getPageNr(arg1):
stro = str(arg1)
stro=stro.replace('.pdf', '')
listR = stro.split(' - ')
listR[len(listR)-1] = listR[len(listR)-1].replace('-','')
listR[len(listR)-1] = listR[len(listR)-1].replace('Page ','')
pgNr=int(listR[len(listR)-1])
return pgNr
currentFolder = os.getcwd()
pdffiles = [os.path.join(name)
for root, dirs, files in os.walk(currentFolder)
for name in files
if name.endswith((".pdf"))]
#create dictionary and get whole list
di={}
#direct copy and create key from page number on back and value is original list
for string in pdffiles:
di.setdefault(getPageNr(string),str(string))
#sort it by keys
di2 = OrderedDict(sorted(di.items()))
pdffiles.clear()
for key,values in di2.items():
pdffiles.append(values)
#put a correction
pageAt = 0
adder = 421
pageAt = pageAt + adder
#add global variables for page in bookmark
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0
#define globals for bookmarks
mainTitle = ''
SecondTitle = ''
thirdTitle = ''
#define previous bookmarks
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''
#if main title is same as next page
isSame = True
#start Merger
editer = PdfFileMerger()
#start main loop
while pageAt<(adder+2000) and pageAt<len(pdffiles) and isSame:
#break filename to titles
titles = pdffiles[pageAt].split(' - ')
#break next page for titles
titlesNext = pdffiles[pageAt+1].split(' - ')
#get titles
mainTitle = titles[0]
secondTitle = titles[1]
if not titlesNext[0] == mainTitle:
isSame = False
hasThird = False
if len(titles)>4:
thirdTitle = titles[2]
hasThird = True
else:
thirdTitle = None
hasThird = False
#open individual page
kStream = open(pdffiles[pageAt], 'rb')
inputK = PdfFileReader(kStream)
#test if titles are changing
if not mainTitle == lastMainTitle:
KmainParent = editer.addBookmark(mainTitle, 0)
if not secondTitle == lastSecondTitle:
secondTitlePage = pageAt-adder
#print(secondTitle)
Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)
if hasThird:
if not thirdTitle == lastThirdTitle:
thirdTitlePage = pageAt-adder
Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)
editer.addBookmark(titles[3], pageAt-adder, Mparent)
else:
editer.addBookmark(titles[2], pageAt-adder, Kparent)
#merge page with fixed bookmarks
editer.merge((pageAt - adder), inputK)
#get titles and save them for future
lastMainTitle = mainTitle
lastSecondTitle = secondTitle
lastThirdTitle = thirdTitle
#go to next page
pageAt += 1
#get name for output file
nameOfFile = mainTitle + '.pdf'
print('Saving ' + nameOfFile)
#start new file and export it
outR = open(nameOfFile, 'wb')
editer.write(outR)
outR.close()
kStream.close()
Now it puts in all the bookmarks, no problem there. But how do I process more than 750 pages?
I have increased the recursion limit and maxstdio, but if there are 1000 or more pages the merged file is 0 bytes, even though the process takes a minute or two, so it is doing something.
I do not get any errors.
Can anybody help me process more than 500 pages?
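One thing worth checking (an assumption, since no error is raised): every source PDF is opened with open(pdffiles[pageAt], 'rb') inside the loop and only the last handle is closed, so by the time write() runs hundreds of file handles are still open. A minimal sketch of a workaround is to read each page into memory and give the merger a BytesIO buffer instead of a live file handle; the bookmark logic from the question can be added inside the same loop.

import io
from PyPDF3 import PdfFileReader, PdfFileMerger

merger = PdfFileMerger()
for position, path in enumerate(pdffiles):  # pdffiles as built and sorted in the question
    with open(path, 'rb') as fh:            # the OS file handle is released as soon as the block exits
        buffer = io.BytesIO(fh.read())      # keep the page bytes in memory instead
    merger.merge(position, PdfFileReader(buffer))

with open('merged.pdf', 'wb') as outR:      # 'merged.pdf' is a placeholder output name
    merger.write(outR)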