Pandas Dataframe returns None after recursive function? - python

I've written a simple script to save out the names of various subfolders into a spreadsheet. It seems to be doing its job at every point up to the return statement. It returns None...
If I add a print statement before the return I can see a populated dataFrame.
I guess I'm missing something obvious, would appreciate some help!
Thanks
import sys, os, glob
from glob import glob
import pandas as pd
def findSubFoldersMultiple(iter,data_container):
if iter > 0:
current_directory = sys.argv[iter]
directory_reformatted = sys.argv[iter] + "/*/"
folders = glob(directory_reformatted )
folders_stripped = [ folder.replace(sys.argv[iter],'').replace('/','') for folder in folders]
curr_data_container = pd.DataFrame({ current_directory: folders_stripped })
combined_data_container = pd.concat([data_container,curr_data_container],axis=1)
findSubFoldersMultiple(iter-1,combined_data_container)
else:
print('Populated container in loop: \n' )
print(data_container)
return data_container
if len(sys.argv)<2:
print ("Please specify directory/directories.")
else:
writer = pd.ExcelWriter('subfolders.xlsx')
empty_frame = pd.DataFrame({})
populated_DF = findSubFoldersMultiple(len(sys.argv) - 1, empty_frame)
print('Returned container: \n' )
print(populated_DF)

Catch the return value by changing the last line in the if block to:
return findSubFoldersMultiple(iter-1,combined_data_container)
Otherwise you're returning the value on the base case (the else block), but not returning it further up the chain of non-base case recursive calls.

Related

Checking len of azure blob storage folder causes function to not run

Weird problem I've run into. I'm currently using the following code:
generic.py
def function_in_different_pyfile(input_folder):
# do stuff here
folder_1 = f"/folder_1"
folder_1_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_1)
folder_2 = f"/folder_2"
folder_2_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_2)
if len([file for file in folder_1_virtualdir]) !=(len([file for file in folder_2_virtualdir]):
generic.function_in_different_pyfile(folder_1_virtualdir)
else:
print('Already done')
So what I'm trying to do is:
Check the number of files in folder_1_virtualdir and folder_2_virtualdir
If they aren't equal, run the function.
If they are, then print statement/pass.
The problem:
The generic.function() runs although doesn't do anything when you pass in the list comprehension.
The generic.function() works totally fine if you don't have a list comprehension in the code e.g:
folder_1 = f"/folder_1"
folder_1_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_1)
folder_2 = f"/folder_2"
folder_2_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_2)
generic.function_in_different_pyfile(folder_1_virtualdir)
Will work completely fine.
There are no error messages. It passes through the function as if the function doesn't do anything.
What I've tried:
I've tested this by modifying the function:
generic.py
def function_in_different_pyfile(input_folder):
print('Start of the function')
# do stuff here
print('End of the function')
You will see these print statements although the function doesn't process any of the files in the input_folder argument if you include the list comprehension.
This is extended to when the list comprehension is ANYWHERE in the code:
folder_1 = f"/folder_1"
folder_1_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_1)
folder_1_contents = [file for file in folder_1_virtualdir]
folder_2 = f"/folder_2"
folder_2_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_2)
generic.function_in_different_pyfile(folder_1_virtualdir)
# Function doesn't run.
I'm fairly new to Python although can't seem to understand why the list comprehension here completely prevents the function from running correctly.
You could try the code if the number of files in the folder is less than 5000:
folder_1 = f"/folder_1"
folder_1_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_1)
folder_2 = f"/folder_2"
folder_2_virtualdir = CONTAINER_CLIENT.list_blobs(name_starts_with=folder_2)
folder_1_count = len(folder_1_virtualdir)
folder_2_count = len(folder_2_virtualdir)
if folder_1_count != folder_2_count :
generic.function_in_different_pyfile(folder_1_virtualdir)
else:
print('Already done')
If greater than 5000, you need to get the number iterating through your blob.
count = 0
for count, item in enumerate(blobs):
print("number", count + 1, "in the list is", item)

Updating Appending List to a txt file

Hello currently i am studying python and i wanted to know on how you can have a list that is being appended if there is a change constantly to a txtfile. Wording is a hard here is the code anyways
list=[]
random_number=0
file_handler=open("history.txt","w")
file_handler.write(str(list))
lenght_cumulative_data=len(list)
confirmed.append(random_number)
Now what i want to accomplish is that the list variable of the number 0 would be shown in history.txt but that doesnt happen and lets just imagine that random_number is always changing I want the list variable to be able to always update itself. Like if let say random_number changes to 1 and then 2 I want list to be updated to [0,1,2]. How do you do that? I've been searching on youtube and all they gave me is this write function is there anyway someone could refrence it or have any ideas?
from os import stat
from _thread import start_new_thread
from time import sleep
List = []
class WatchFileForChanges:
def __init__(self, filename):
self.file = filename
self.cached_file = stat(self.file).st_mtime
def watch(self):
num = 0
while 1:
status = stat(self.file).st_mtime
if status != self.cached_file:
self.cached_file = status
#file changed
List.append(num)
num += 1
def main():
Watcher = WatchFileForChanges("file.txt")
start_new_thread(Watcher.watch, ())
while 1:
print(List)
sleep(1)
if __name__ == '__main__':
main()
This will do what you want.
If I understood you correctly, you want to append to the list every time a file changes.
Note: this answer will only work on Windows
changes.py:
# Adapted from http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
import threading
import os
import win32file
import win32con
ACTIONS = {
1 : "Created",
2 : "Deleted",
3 : "Updated",
4 : "Renamed from something",
5 : "Renamed to something"
}
# Thanks to Claudio Grondi for the correct set of numbers
FILE_LIST_DIRECTORY = 0x0001
def monitor_changes(callback, path, filenames):
path = path or ""
if type(filenames) == "str":
filenames = (filenames,)
thread = threading.Thread(target=_monitor, args=(callback, path, filenames))
thread.start()
return thread
def _monitor(callback, path, filenames):
hDir = win32file.CreateFile (
path,
FILE_LIST_DIRECTORY,
win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
None,
win32con.OPEN_EXISTING,
win32con.FILE_FLAG_BACKUP_SEMANTICS,
None
)
while True:
#
# ReadDirectoryChangesW takes a previously-created
# handle to a directory, a buffer size for results,
# a flag to indicate whether to watch subtrees and
# a filter of what changes to notify.
#
# NB Tim Juchcinski reports that he needed to up
# the buffer size to be sure of picking up all
# events when a large number of files were
# deleted at once.
#
results = win32file.ReadDirectoryChangesW (
hDir,
1024,
True,
win32con.FILE_NOTIFY_CHANGE_LAST_WRITE,
None,
None
)
for action, file in results:
if filenames and file not in filenames and os.path.basename(file) not in filenames:
continue
callback(action, file)
if __name__ == '__main__':
# monitor by printing
t = monitor_changes(print, ".", None)
And in your main.py:
import changes
import os
my_list = []
def callback(action_id, filename):
# the function running means
# that the file has been modified
action_desc = changes.ACTIONS[action_id]
print(action_desc, filename)
with open(filename) as f:
my_list.append(f.read())
thread = changes.monitor_changes(callback, ".", "my_file_that_I_want_to_monitor.txt")
If you want to monitor all files in the directory, call monitor_changes with None as the third argument.
Note: this will monitor all subdirectories, so files with the same name but in different folders will trigger the callback. If you want to avoid this, then check the filename passed to your callback function is exactly what you want to monitor.

Weird sequence when writing into txt document

Hi everyone this is my first time here, and I am a beginner in Python. I am in the middle of writing a program that returns a txt document containing information about a stock (Watchlist Info.txt), based on the input of another txt document containing the company names (Watchlist).
To achieve this, I have written 3 functions, of which 2 functions reuters_ticker() and stock_price() are completed as shown below:
def reuters_ticker(desired_search):
#from company name execute google search for and return reuters stock ticker
try:
from googlesearch import search
except ImportError:
print('No module named google found')
query = desired_search + ' reuters'
for j in search(query, tld="com.sg", num=1, stop=1, pause=2):
result = j
ticker = re.search(r'\w+\.\w+$', result)
return ticker.group()
Stock Price:
def stock_price(company, doc=None):
ticker = reuters_ticker(company)
request = 'https://www.reuters.com/companies/' + ticker
raw_main = pd.read_html(request)
data1 = raw_main[0]
data1.set_index(0, inplace=True)
data1 = data1.transpose()
data2 = raw_main[1]
data2.set_index(0, inplace=True)
data2 = data2.transpose()
stock_info = pd.concat([data1,data2], axis=1)
if doc == None:
print(company + '\n')
print('Previous Close: ' + str(stock_info['Previous Close'][1]))
print('Forward PE: ' + str(stock_info['Forward P/E'][1]))
print('Div Yield(%): ' + str(stock_info['Dividend (Yield %)'][1]))
else:
from datetime import date
with open(doc, 'a') as output:
output.write(date.today().strftime('%d/%m/%y') + '\t' + str(stock_info['Previous Close'][1]) + '\t' + str(stock_info['Forward P/E'][1]) + '\t' + '\t' + str(stock_info['Dividend (Yield %)'][1]) + '\n')
output.close()
The 3rd function, watchlist_report(), is where I am getting problems with writing the information in the format as desired.
def watchlist_report(watchlist):
with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
searches = companies.read()
x = searches.split('\n')
for i in x:
output.write(i + ':\n')
stock_price(i, doc='Watchlist Info.txt')
output.write('\n')
When I run watchlist_report('Watchlist.txt'), where Watchlist.txt contains 'Apple' and 'Facebook' each on new lines, my output is this:
26/04/20 275.03 22.26 1.12
26/04/20 185.13 24.72 --
Apple:
Facebook:
Instead of what I want and would expect based on the code I have written in watchlist_report():
Apple:
26/04/20 275.03 22.26 1.12
Facebook:
26/04/20 185.13 24.72 --
Therefore, my questions are:
1) Why is my output formatted this way?
2) Which part of my code do I have to change to make the written output in my desired format?
Any other suggestions about how I can clean my code and any libraries I can use to make my code nicer are also appreciated!
You handle two different file-handles - the file-handle inside your watchlist_report gets closed earlier so its being written first, before the outer functions file-handle gets closed, flushed and written.
Instead of creating a new open(..) in your function, pass the current file handle:
def watchlist_report(watchlist):
with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
searches = companies.read()
x = searches.split('\n')
for i in x:
output.write(i + ':\n')
stock_price(i, doc = output) # pass the file handle
output.write('\n')
Inside def stock_price(company, doc=None): use the provided filehandle:
def stock_price(company, output = None): # changed name here
# [snip] - removed unrelated code for this answer for brevity sake
if output is None: # check for None using IS
print( ... ) # print whatever you like here
else:
from datetime import date
output.write( .... ) # write whatever you want it to write
# output.close() # do not close, the outer function does this
Do not close the file handle in the inner function, the context handling with(..) of the outer function does that for you.
The main takeaway for file handling is that things you write(..) to your file are not neccessarily placed there immediately. The filehandler chooses when to actually persist data to your disk, the latests it does that is when it goes out of scope (of the context handler) or when its internal buffer reaches some threshold so it "thinks" it is now prudent to alter to data on your disc. See How often does python flush to a file? for more infos.

Creating loop for __main__

I am new to Python, and I want your advice on something.
I have a script that runs one input value at a time, and I want it to be able to run a whole list of such values without me typing the values one at a time. I have a hunch that a "for loop" is needed for the main method listed below. The value is "gene_name", so effectively, i want to feed in a list of "gene_names" that the script can run through nicely.
Hope I phrased the question correctly, thanks! The chunk in question seems to be
def get_probes_from_genes(gene_names)
import json
import urllib2
import os
import pandas as pd
api_url = "http://api.brain-map.org/api/v2/data/query.json"
def get_probes_from_genes(gene_names):
if not isinstance(gene_names,list):
gene_names = [gene_names]
#in case there are white spaces in gene names
gene_names = ["'%s'"%gene_name for gene_name in gene_names]**
api_query = "?criteria=model::Probe"
api_query= ",rma::criteria,[probe_type$eq'DNA']"
api_query= ",products[abbreviation$eq'HumanMA']"
api_query= ",gene[acronym$eq%s]"%(','.join(gene_names))
api_query= ",rma::options[only$eq'probes.id','name']"
data = json.load(urllib2.urlopen(api_url api_query))
d = {probe['id']: probe['name'] for probe in data['msg']}
if not d:
raise Exception("Could not find any probes for %s gene. Check " \
"http://help.brain- map.org/download/attachments/2818165/HBA_ISH_GeneList.pdf? version=1&modificationDate=1348783035873 " \
"for list of available genes."%gene_name)
return d
def get_expression_values_from_probe_ids(probe_ids):
if not isinstance(probe_ids,list):
probe_ids = [probe_ids]
#in case there are white spaces in gene names
probe_ids = ["'%s'"%probe_id for probe_id in probe_ids]
api_query = "? criteria=service::human_microarray_expression[probes$in%s]"% (','.join(probe_ids))
data = json.load(urllib2.urlopen(api_url api_query))
expression_values = [[float(expression_value) for expression_value in data["msg"]["probes"][i]["expression_level"]] for i in range(len(probe_ids))]
well_ids = [sample["sample"]["well"] for sample in data["msg"] ["samples"]]
donor_names = [sample["donor"]["name"] for sample in data["msg"] ["samples"]]
well_coordinates = [sample["sample"]["mri"] for sample in data["msg"] ["samples"]]
return expression_values, well_ids, well_coordinates, donor_names
def get_mni_coordinates_from_wells(well_ids):
package_directory = os.path.dirname(os.path.abspath(__file__))
frame = pd.read_csv(os.path.join(package_directory, "data", "corrected_mni_coordinates.csv"), header=0, index_col=0)
return list(frame.ix[well_ids].itertuples(index=False))
if __name__ == '__main__':
probes_dict = get_probes_from_genes("SLC6A2")
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)
whoa, first things first. Python ain't Java, so do yourself a favor and use a nice """xxx\nyyy""" string, with triple quotes to multiline.
api_query = """?criteria=model::Probe"
,rma::criteria,[probe_type$eq'DNA']
...
"""
or something like that. you will get white spaces as typed, so you may need to adjust.
If, like suggested, you opt to loop on the call to your function through a file, you will need to either try/except your data-not-found exception or you will need to handle missing data without throwing an exception. I would opt for returning an empty result myself and letting the caller worry about what to do with it.
If you do opt for raise-ing an Exception, create your own, rather than using a generic exception. That way your code can catch your expected Exception first.
class MyNoDataFoundException(Exception):
pass
#replace your current raise code with...
if not d:
raise MyNoDataFoundException(your message here)
clarification about catching exceptions, using the accepted answer as a starting point:
if __name__ == '__main__':
with open(r"/tmp/genes.txt","r") as f:
for line in f.readlines():
#keep track of your input data
search_data = line.strip()
try:
probes_dict = get_probes_from_genes(search_data)
except MyNoDataFoundException, e:
#and do whatever you feel you need to do here...
print "bummer about search_data:%s:\nexception:%s" % (search_data, e)
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)
You may want to create a file with Gene names, then read content of the file and call your function in the loop. Here is an example below
if __name__ == '__main__':
with open(r"/tmp/genes.txt","r") as f:
for line in f.readlines():
probes_dict = get_probes_from_genes(line.strip())
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
global LauncherCount
LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work too as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
""" Filtering methods """
def __init__(self):
print("launching methods")
# Return the list: [Latitude,Longitude] (elements are floating point numbers)
def LatLong(self,line):
comaCount = []
comaCount.append(line.find(','))
comaCount.append(line.find(',',comaCount[0] + 1))
comaCount.append(line.find(',',comaCount[1] + 1))
Lat = line[comaCount[0] + 1 : comaCount[1]]
Long = line[comaCount[1] + 1 : comaCount[2]]
try:
return [float(Lat) , float(Long)]
except ValueError:
return [0,0]
# Return a boolean:
# - True if the Lat/Long is within the Lat/Long rectangle defined by:
# tupleFilter = (minLat,maxLat,minLong,maxLong)
# - False if not
def LatLongFilter(self,LatLongList , tupleFilter) :
if tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]:
return True
else:
return False
def writeLine(self,key,line):
filterDico[key][1].write(line)
def filteringProcess(dico):
myFilter = Filter()
while True:
try:
currentLine = readFile.readline()
except ValueError:
break
if len(currentLine) ==0: # Breaks at the end of the file
break
if len(currentLine) < 35: # Deletes wrong lines (too short)
continue
LatLongList = myFilter.LatLong(currentLine)
for key in dico:
if myFilter.LatLongFilter(LatLongList,dico[key][0]):
myFilter.writeLine(key,currentLine)
###########################################################################
# Main
###########################################################################
# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create outputs
for key in filterDico:
output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4]
+ '_' + key +'.log'
pathDico[output_Name] = config.writingFolder + output_Name
filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
### Create and start processes:
for i in CPURange:
p.append(multiprocessing.Process(target = filteringProcess ,
args = (filterDico,)))
p[i].start()
### Kill processes:
while True:
if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
readFile.close()
for key in config.filterDico:
config.filterDico[key][1].close()
print(key,"is Done!")
endTime = time.localtime()
break
print("Process started at:",startingTime)
print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
"""Process a single file."""
i, filename = args
print("working on %s" % (filename,))
return i
def files():
"""Generate input filenames to work on."""
#NOTE: you could read the file list from a file, get it using glob.glob, etc
yield "inputfile1"
yield "inputfile2"
def process_files(pool, filenames):
"""Process filenames using pool of processes.
Wait for results.
"""
for result in pool.imap_unordered(work_on, enumerate(filenames)):
#NOTE: in general the files won't be processed in the original order
print(result)
def main():
p = Pool()
# to do "successive" multiprocessing
for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
process_files(p, filenames)
if __name__=="__main__":
main()
Each process_file() is called in sequence after the previous one has been complete i.e., the files from different calls to process_files() are not processed in parallel.

Categories

Resources