Use of 'and' in if statements - python

I need to check thousands of directories for two kinds of files. I have restricted the index, idx, to less than four, since within that range the two kinds of files that need to be found, the '.jpg' and the 'thmb', would appear. But I need the if statement to require that both kinds of files are in the directory. The if statement:
if ('.jpg' in val) and ('thmb' in val):
works, except that I keep getting printouts through the else statement saying that data is missing when that is not true:
Data missing W:\\North2015\200\10 200001000031.jpg 0
Data missing W:\\North2015\200\10 200001000032.jpg 1
Data missing W:\\North2015\200\100 200014000001.jpg 0
Data missing W:\\North2015\200\100 200014000002.jpg 1
Data missing W:\\North2015\200\101 200014100081.jpg 2
Here is the code below:
def missingFileSearch():
    for folder in setFinder():
        for idx, val in enumerate(os.listdir(folder)):
            if idx < 4:
                if ('.jpg' in val) and ('thmb' in val):
                    pass
                else:
                    print 'Data missing', folder, val, idx
So I am wondering why I am getting output through the else statement.
Also, this line of code gets hung up:
if val.endswith('.jpg') and ('thmb' in val):
    print 'Data is here!', folder, val, idx
This is chiefly what I need the code to do.

The check runs once per file, so any single filename that is not both a '.jpg' and a 'thmb' falls into the else branch, even when the folder contains both kinds of files. To test each folder as a whole instead, I would do this:
def missingFileSearch():
    folders_with_missing = []
    for folder in setFinder():
        thmb_found = False
        jpg_found = False
        for fname in os.listdir(folder):
            thmb_found |= 'thmb' in fname
            jpg_found |= fname.endswith('.jpg')
            if thmb_found and jpg_found:
                break  # break inner loop, move on to check next folder
        else:  # loop not broken
            if not thmb_found and not jpg_found:
                desc = "no thmb, no .jpg"
            elif not thmb_found:
                desc = "no thmb"
            else:
                desc = "no .jpg"
            folders_with_missing.append((folder, desc))
    return folders_with_missing
I have tested a slightly modified version of this code (no setFinder() function):
def missingFileSearch():
    folders_with_missing = []
    for folder in os.listdir(my_dir):
        thmb_found = False
        jpg_found = False
        for fname in os.listdir(os.path.join(my_dir, folder)):
            thmb_found |= 'thmb' in fname
            jpg_found |= fname.endswith('.jpg')
            if thmb_found and jpg_found:
                break  # break inner loop, move on to check next folder
        else:  # loop not broken
            if not thmb_found and not jpg_found:
                desc = "no thmb, no .jpg"
            elif not thmb_found:
                desc = "no thmb"
            else:
                desc = "no .jpg"
            folders_with_missing.append((folder, desc))
    return folders_with_missing
I created four test folders with self-explanatory names:
>>> os.listdir(my_dir)
['both_thmb_jpg', 'missing_jpg', 'missing_thmb', 'no_files']
Then ran the function:
>>> missingFileSearch()
[('missing_jpg', 'no .jpg'), ('missing_thmb', 'no thmb'), ('no_files', 'no thmb, no .jpg')]
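If the restriction to the first few directory entries is not actually needed, the same idea can be written more compactly with any() over the whole listing (a sketch, assuming setFinder() yields folder paths as in the question):
import os

def missingFileSearch():
    """Sketch: collect folders whose listing lacks a .jpg or a thmb file."""
    folders_with_missing = []
    for folder in setFinder():
        names = os.listdir(folder)
        has_jpg = any(n.endswith('.jpg') for n in names)
        has_thmb = any('thmb' in n for n in names)
        if not (has_jpg and has_thmb):
            folders_with_missing.append(folder)
    return folders_with_missing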

Simplifying code blocks with functions [Python]

I'm searching for a way to simplify my code via functions. 90% of my operations are identical and only differ in the if condition.
E.g.
if isFile:
    fFound = False
    for key in files:
        if item["path"] in key:
            fFound = True
            for c in cmds.keys():
                if item["path"] in cmds[c]["files"]:
                    ifxchecker(item["requiredIFX"], cmds[c]["ifx_match"])
                    outputCFG()
    if not fFound:
        notFound.append(item['path'])
else:
    dir = item["path"][:-1]
    pFound = False
    for key in files:
        if dir in key:
            pFound = True
            for c in cmds.keys():
                for file in cmds[c]["files"]:
                    if dir in file:
                        ifxchecker(item["requiredIFX"], cmds[c]["ifx_match"])
                        outputCFG()
    if not pFound:
        notFound.append(dir)
My code is working fine; I'm just trying to move most of it into a function so that only these small if conditions differ. I can't find a way to simplify it, and I'm not even sure there is one.
I did write some small functions, as you can see, but I think there should be a better way to simplify the whole construct.
Unfortunately I can't test it, because multiple variables and methods are not defined, but it seems to work. Using an is_dir bool parameter instead of elem might be nicer; if you prefer that, replace elem with is_dir and add the following line to the beginning of the function:
elem = item["path"][:-1] if is_dir else item["path"]
def do_stuff(elem, files, item, cmds):
    fFound = False
    for key in files:
        if elem in key:
            fFound = True
            for c in cmds.keys():
                if elem in cmds[c]["files"]:
                    ifxchecker(item["requiredIFX"], cmds[c]["ifx_match"])
                    outputCFG()
    if not fFound:
        return elem

if isFile:
    res = do_stuff(item["path"], files, item, cmds)
    if res is not None:
        notFound.append(res)
else:
    res = do_stuff(item["path"][:-1], files, item, cmds)
    if res is not None:
        notFound.append(res)
I solved it like that with @azro's method:
def cfgFunction(x):
    global file
    fFound = False
    for file in files:
        if x in file:
            fFound = True
            for group in cmds.keys():
                if x in cmds[group]["files"]:
                    ifxchecker(item["requiredIFX"], cmds[group]["ifx_match"])
                    outputCFG()
    if not fFound:
        notFound.append(x)
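For context, the two original branches then presumably reduce to calls like the following (a sketch; item and isFile are assumed to be in scope exactly as in the original loop):
if isFile:
    cfgFunction(item["path"])
else:
    cfgFunction(item["path"][:-1])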

Python for loop only running once?

This script checks a CSV containing species names against a database in another CSV and reports whether each species appears in both. The issue is that although it still reads all the search terms correctly, it only searches for the first one; i.e. if I print speciesl before 'for row in p', all species names are returned correctly.
from pathlib import Path
import os
import csv

p = csv.reader(open('Paldat.csv', 'r', newline=''), delimiter=',')
with open('newsssssss.csv', 'r', newline='\n') as r:
    for line in r:
        taxons = line.split(',')
        no = ['\r\n']
        noo = ['\n']
        if taxons == no:
            continue
        elif taxons == noo:
            continue
        else:
            speciesl = []
            for val in taxons:
                val = val.replace('\n', '')
                speciesl.append(val)
            g = speciesl[0].lower()
            if len(speciesl) < 2:
                continue
            else:
                s = speciesl[1].lower()
                for row in p:  # This loop seems to be the issue
                    genus = row[0].lower()
                    species = row[1].lower()
                    if g == genus and s == species:
                        print('Perfect match')
                        print(g)
                    elif s == species:
                        print(speciesl)
                        print('Species found')
                    else:
                        continue
                else:
                    continue
Here is part of Paldat.csv:
Camassia,leichtlinii,monad,monad,large (51-100 µm),-,-,-,-,-,sulcate,heteropolar,oblate,-,elliptic,-,-,boat-shaped,no suitable term,aperture(s) sunken,1,sulcus,sulcate,aperture membrane ornamented,-,-,-,"reticulate, heterobrochate, perforate",-,-,-,-,-,-,-,-,-,-,-,present,,
Cistus,parviflorus,monad,monad,medium-sized (26-50 µm),-,-,-,-,-,colporate,isopolar,-,spheroidal,circular,-,-,spheroidal,circular,"aperture(s) sunken, not infolded",3,colporus,"colporate, tricolporate",-,-,-,-,striato-reticulate,-,-,-,-,-,-,-,-,-,-,-,absent,,
Camellia,japonica,monad,monad,medium-sized (26-50 µm),41-50 µm,36-40 µm,41-50 µm,41-50 µm,41-50 µm,colpate,isopolar,-,spheroidal,circular,oblique,prolate,-,triangular,aperture(s) sunken,3,colpus,"colpate, tricolpate",operculum,"granulate, scabrate, reticulate",-,-,microreticulate,-,-,-,-,-,-,-,-,-,-,-,-,,
Camellia,sinensis,monad,monad,medium-sized (26-50 µm),41-50 µm,36-40 µm,41-50 µm,41-50 µm,41-50 µm,colporate,isopolar,oblate,-,triangular,oblique,isodiametric,-,triangular,aperture(s) sunken,3,colporus,"colporate, tricolporate",operculum,"scabrate, verrucate, gemmate",-,-,"verrucate, perforate",-,-,-,-,-,-,-,-,-,-,-,-,,
And part of newsssssss.csv:
Camassia,leichtlinii
Camellia,japonica
Camellia,sinensis
Chrysanthemum,leucanthemum
Cirsium,arvense
Cissus,quadrangularis
Try removing "newline='\n'" from the "open" line.
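Another thing worth checking (an assumption, not something confirmed in the question): p is a csv.reader over an open file, so it is an iterator that is exhausted after the first complete pass of the inner loop; every later species then has nothing left to compare against. Reading Paldat.csv into a list once makes it reusable:
import csv

# Materialise the database rows once so they can be re-scanned for every species.
with open('Paldat.csv', 'r', newline='') as f:
    paldat_rows = list(csv.reader(f, delimiter=','))
The inner loop would then iterate over paldat_rows instead of p.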

Python 3.6 script surprisingly slow on Windows 10 but not on Ubuntu 17.10

I recently had to write a challenge for a company that was to merge 3 CSV files into one based on the first attribute of each (the attributes were repeating in all files).
I wrote the code and sent it to them, but they said it took 2 minutes to run. That was odd, because it ran in 10 seconds on my machine. My machine had the same processor, 16 GB of RAM, and an SSD as well. Very similar environments.
I tried optimising it and resubmitted it. This time they said they ran it on an Ubuntu machine and got 11 seconds, while it still ran for 100 seconds on Windows 10.
Another peculiar thing was that when I tried profiling it with the profile module, it went on forever and I had to terminate it after 450 seconds. I moved to cProfile and it recorded about 7 seconds.
EDIT: The exact formulation of the problem is
Write a console program to merge the files provided in a timely and
efficient manner. File paths should be supplied as arguments so that
the program can be evaluated on different data sets. The merged file
should be saved as CSV; use the id column as the unique key for
merging; the program should do any necessary data cleaning and error
checking.
Feel free to use any language you’re comfortable with – only
restriction is no external libraries as this defeats the purpose of
the test. If the language provides CSV parsing libraries (like
Python), please avoid using them as well as this is a part of the
test.
Without further ado here's the code:
#!/usr/bin/python3
import sys
from multiprocessing import Pool

HEADERS = ['id']

def csv_tuple_quotes_valid(a_tuple):
    """
    checks if the quotes in each attribute of an entry (i.e. a tuple) agree with the csv format
    returns True or False
    """
    for attribute in a_tuple:
        in_quotes = False
        attr_len = len(attribute)
        skip_next = False
        for i in range(0, attr_len):
            if not skip_next and attribute[i] == '\"':
                if i < attr_len - 1 and attribute[i + 1] == '\"':
                    skip_next = True
                    continue
                elif i == 0 or i == attr_len - 1:
                    in_quotes = not in_quotes
                else:
                    return False
            else:
                skip_next = False
        if in_quotes:
            return False
    return True
def check_and_parse_potential_tuple(to_parse):
    """
    receives a string and returns an array of the attributes of the csv line
    if the string was not a valid csv line, then returns False
    """
    a_tuple = []
    attribute_start_index = 0
    to_parse_len = len(to_parse)
    in_quotes = False
    i = 0
    #iterate through the string (line from the csv)
    while i < to_parse_len:
        current_char = to_parse[i]
        #this works the following way: if we meet a quote ("), it must be in one
        #of five cases: "" | ", | ," | "\0 | (start_of_string)"
        #in case we are inside a quoted attribute (i.e. "123"), then commas are ignored
        #the following code also extracts the tuples' attributes
        if current_char == '\"':
            if i == 0 or (to_parse[i - 1] == ',' and not in_quotes):  # (start_of_string)" and ," case
                #not including the quote in the next attr
                attribute_start_index = i + 1
                #starting a quoted attr
                in_quotes = True
            elif i + 1 < to_parse_len:
                if to_parse[i + 1] == '\"':  # "" case
                    i += 1  #skip the next " because it is part of a ""
                elif to_parse[i + 1] == ',' and in_quotes:  # ", case
                    a_tuple.append(to_parse[attribute_start_index:i].strip())
                    #not including the quote and comma in the next attr
                    attribute_start_index = i + 2
                    in_quotes = False  #the quoted attr has ended
                    #skip the next comma - we know what it is for
                    i += 1
                else:
                    #since we cannot have a random " in the middle of an attr
                    return False
            elif i == to_parse_len - 1:  # "\0 case
                a_tuple.append(to_parse[attribute_start_index:i].strip())
                #reached end of line, so no more attr's to extract
                attribute_start_index = to_parse_len
                in_quotes = False
            else:
                return False
        elif current_char == ',':
            if not in_quotes:
                a_tuple.append(to_parse[attribute_start_index:i].strip())
                attribute_start_index = i + 1
        i += 1
    #in case the last attr was left empty or unquoted
    if attribute_start_index < to_parse_len or (not in_quotes and to_parse[-1] == ','):
        a_tuple.append(to_parse[attribute_start_index:])
    #line ended while parsing; i.e. a quote was opened but not closed
    if in_quotes:
        return False
    return a_tuple
def parse_tuple(to_parse, no_of_headers):
    """
    parses a string and returns an array with no_of_headers number of headers
    raises an error if the string was not a valid CSV line
    """
    #get rid of the newline at the end of every line
    to_parse = to_parse.strip()
    # return to_parse.split(',')  #if we assume the data is in a valid format
    #the format checking below increases the execution time by a factor of 2;
    #if the data is known to be valid, uncomment the return statement above
    #if there are more commas than fields, then we must take into consideration
    #how the quotes parse and then extract the attributes
    if to_parse.count(',') + 1 > no_of_headers:
        result = check_and_parse_potential_tuple(to_parse)
        if result:
            a_tuple = result
        else:
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
    else:
        a_tuple = to_parse.split(',')
        if not csv_tuple_quotes_valid(a_tuple):
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
    #if the format is correct but more data fields were provided
    #the following works faster than an if statement that checks the length of a_tuple
    try:
        a_tuple[no_of_headers - 1]
    except IndexError:
        raise TypeError('Error while parsing CSV line %s. Unknown reason' % to_parse)
    #this replaces the use of my own hashtables to store the duplicated values for the attributes
    for i in range(1, no_of_headers):
        a_tuple[i] = sys.intern(a_tuple[i])
    return a_tuple
def read_file(path, file_number):
    """
    reads the csv file and returns (dict, int)
    the dict is the mapping of id's to attributes
    the integer is the number of attributes (headers) for the csv file
    """
    global HEADERS
    try:
        file = open(path, 'r')
    except FileNotFoundError as e:
        print("error in %s:\n%s\nexiting..." % (path, e))
        exit(1)
    main_table = {}
    headers = file.readline().strip().split(',')
    no_of_headers = len(headers)
    HEADERS.extend(headers[1:])  #keep the headers from the file
    lines = file.readlines()
    file.close()
    args = []
    for line in lines:
        args.append((line, no_of_headers))
    #pool is a pool of worker processes parsing the lines in parallel
    with Pool() as workers:
        try:
            all_tuples = workers.starmap(parse_tuple, args, 1000)
        except TypeError as e:
            print('Error in file %s:\n%s\nexiting thread...' % (path, e.args))
            exit(1)
    for a_tuple in all_tuples:
        #add quotes to key if needed
        key = a_tuple[0] if a_tuple[0][0] == '\"' else ('\"%s\"' % a_tuple[0])
        main_table[key] = a_tuple[1:]
    return (main_table, no_of_headers)
def merge_files():
    """
    produces a file called merged.csv
    """
    global HEADERS
    no_of_files = len(sys.argv) - 1
    processed_files = [None] * no_of_files
    for i in range(0, no_of_files):
        processed_files[i] = read_file(sys.argv[i + 1], i)
    out_file = open('merged.csv', 'w+')
    merged_str = ','.join(HEADERS)
    all_keys = {}
    #this is to ensure that we include all keys in the final file,
    #even those that are missing from some files and present in others
    for processed_file in processed_files:
        all_keys.update(processed_file[0])
    for key in all_keys:
        merged_str += '\n%s' % key
        for i in range(0, no_of_files):
            (main_table, no_of_headers) = processed_files[i]
            try:
                for attr in main_table[key]:
                    merged_str += ',%s' % attr
            except KeyError:
                print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
                merged_str += ',' * (no_of_headers - 1)
    out_file.write(merged_str)
    out_file.close()

if __name__ == '__main__':
    # merge_files()
    import cProfile
    cProfile.run('merge_files()')
    # import time
    # start = time.time()
    # print(time.time() - start);
Here is the profiler report I got on my Windows.
EDIT: The rest of the csv data provided is here. Pastebin was taking too long to process the files, so...
It might not be the best code, and I know that, but my question is: what slows Windows down so much that does not slow down Ubuntu? The merge_files() function takes the longest, 94 seconds just by itself, not counting the calls to other functions. And there doesn't seem to be anything obvious to me about why it is so slow.
Thanks
EDIT: Note: We both used the same dataset to run the code with.
It turns out that Windows and Linux handle very long strings differently. When I moved the out_file.write(merged_str) inside the outer for loop (for key in all_keys:) and stopped appending to merged_str, it ran for 11 seconds as expected. I don't have enough knowledge of either OS's memory management to predict why it is so different.
But I would say that the second way (writing inside the loop) is the more fail-safe method, because it is unreasonable to keep a 30 MB string in memory. It just turns out that Linux copes with that better and doesn't always try to keep the string in cache, or to rebuild it every time.
Funnily enough, I initially ran it a few times on my Linux machine with these same writing strategies, and the one with the large string seemed to go faster, so I stuck with it. I guess you never know.
Here's the modified code
for key in all_keys:
    merged_str = '%s' % key
    for i in range(0, no_of_files):
        (main_table, no_of_headers) = processed_files[i]
        try:
            for attr in main_table[key]:
                merged_str += ',%s' % attr
        except KeyError:
            print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
            merged_str += ',' * (no_of_headers - 1)
    out_file.write(merged_str + '\n')
out_file.close()
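More generally (a sketch using the same variable names, not the code the poster ran): repeatedly appending to one ever-growing Python string copies it over and over, so either writing each row as soon as it is built, as above, or collecting the fields in a list and joining them once keeps the cost roughly linear:
# Hypothetical rewrite of the output stage of merge_files(): build each row
# as a list of fields and write it immediately instead of growing one huge string.
with open('merged.csv', 'w') as out_file:
    out_file.write(','.join(HEADERS) + '\n')
    for key in all_keys:
        fields = [key]
        for main_table, no_of_headers in processed_files:
            # Missing ids get the right number of empty columns for that file.
            fields.extend(main_table.get(key, [''] * (no_of_headers - 1)))
        out_file.write(','.join(fields) + '\n')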
When I run your solution on Ubuntu 16.04 with the three given files, it seems to take ~8 seconds to complete. The only modification I made was to uncomment the timing code at the bottom and use it.
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
8.039648056030273
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
7.78482985496521
I rewrote my first attempt without using csv from the standard library and am now getting times of ~4.3 seconds.
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.332579612731934
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.305467367172241
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.27345871925354
This is my solution code (lettuce_merge.py):
from collections import defaultdict

def split_row(csv_row):
    return [col.strip('"') for col in csv_row.rstrip().split(',')]

def merge_csv_files(files):
    file_headers = []
    merged_headers = []
    for i, file in enumerate(files):
        current_header = split_row(next(file))
        unique_key, *current_header = current_header
        if i == 0:
            merged_headers.append(unique_key)
        merged_headers.extend(current_header)
        file_headers.append(current_header)
    result = defaultdict(lambda: [''] * (len(merged_headers) - 1))
    for file_header, file in zip(file_headers, files):
        for line in file:
            key, *values = split_row(line)
            for col_name, col_value in zip(file_header, values):
                result[key][merged_headers.index(col_name) - 1] = col_value
        file.close()
    quotes = '"{}"'.format
    with open('lettuce_merged.csv', 'w') as f:
        f.write(','.join(quotes(a) for a in merged_headers) + '\n')
        for key, values in result.items():
            f.write(','.join(quotes(b) for b in [key] + values) + '\n')

if __name__ == '__main__':
    from argparse import ArgumentParser, FileType
    from time import time

    parser = ArgumentParser()
    parser.add_argument('files', nargs='*', type=FileType('r'))
    args = parser.parse_args()

    start_time = time()
    merge_csv_files(args.files)
    print(time() - start_time)
I'm sure this code could be optimized even further but sometimes just seeing another way to solve a problem can help spark new ideas.

Python winreg looping through sub-keys

I'm able to successfully retrieve the 5 sub-keys from my Windows 7 machine's registry hive "HKEY_LOCAL_MACHINE" with the code below.
from _winreg import *

try:
    i = 0
    while True:
        subkey = EnumKey(HKEY_LOCAL_MACHINE, i)
        print subkey
        i += 1
except WindowsError:
    pass
My question is, how do I then enumerate the keys under those? I want to end up listing all the keys in the SOFTWARE\Microsoft\Windows NT\CurrentVersion\NetworkList\Signatures\Unmanaged folder but I can't figure out how to step my way down there.
In response to the first comment, I ran this code on my machine and while it didn't error out, it didn't produce results.
from _winreg import *

aReg = ConnectRegistry(None, HKEY_LOCAL_MACHINE)
aKey = OpenKey(aReg, r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\NetworkList\Signatures\Unmanaged")
for i in range(1024):
    try:
        keyname = EnumKey(aKey, i)
        asubkey = OpenKey(aKey, keyname)
        val = QueryValueEx(asubkey, "Description")
        print val
    except WindowsError:
        break
A regedit or reg query shows 6 values in that folder but I can't get a python script to show me those six.
I just want to add a perhaps more Pythonic solution.
from _winreg import *
from contextlib import suppress
import itertools

def subkeys(path, hkey=HKEY_LOCAL_MACHINE, flags=0):
    with suppress(WindowsError), OpenKey(hkey, path, 0, KEY_READ | flags) as k:
        for i in itertools.count():
            yield EnumKey(k, i)
You can now access the keys as expected
for key in subkeys(r'path\to\your\key'):
    print key
For python versions < 3.4 that lack suppress(), I recommend adding it to your project:
from contextlib import contextmanager

@contextmanager
def suppress(*exceptions):
    try:
        yield
    except exceptions:
        pass
Note: if you have trouble reading some values, you might be reading from the wrong registry view; pass KEY_WOW64_64KEY or KEY_WOW64_32KEY to the flags parameter. Using OpenKey() as a context manager was introduced in Python 2.6.
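For example, to force the 64-bit view of the registry, the call might look like this (a usage sketch; the path is only an example):
for key in subkeys(r'SOFTWARE\Microsoft\Windows', flags=KEY_WOW64_64KEY):
    print key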
Does something like this work?
import _winreg

def subkeys(key):
    i = 0
    while True:
        try:
            subkey = _winreg.EnumKey(key, i)
            yield subkey
            i += 1
        except WindowsError:
            break

def traverse_registry_tree(key=_winreg.HKEY_LOCAL_MACHINE, tabs=0):
    for k in subkeys(key):
        print '\t' * tabs + str(k)
        traverse_registry_tree(k, tabs + 1)
I don't have the same registry keys to search but the following code will list all the subkeys in HKEY_LOCAL_MACHINE\Software. I think if you change the value of the keyVal string to your directory it will work.
The try ... except block is written this way because EnumKey will eventually fail. I didn't do it as a for loop because I don't know how to get the correct length of aKey.
keyVal = r"Software"
aKey = OpenKey(HKEY_LOCAL_MACHINE, keyVal, 0, KEY_ALL_ACCESS)
try:
i = 0
while True:
asubkey = EnumKey(aKey, i)
print(asubkey)
i += 1
except WindowsError:
pass
This works, and prints out the list of all subkeys (a fixed version of @Broseph's answer):
import _winreg

def subkeys(key):
    i = 0
    while True:
        try:
            subkey = _winreg.EnumKey(key, i)
            yield subkey
            i += 1
        except WindowsError as e:
            break

def traverse_registry_tree(hkey, keypath, tabs=0):
    key = _winreg.OpenKey(hkey, keypath, 0, _winreg.KEY_READ)
    for subkeyname in subkeys(key):
        print '\t' * tabs + subkeyname
        subkeypath = "%s\\%s" % (keypath, subkeyname)
        traverse_registry_tree(hkey, subkeypath, tabs + 1)

keypath = r"SOFTWARE\Microsoft\Windows"
traverse_registry_tree(_winreg.HKEY_LOCAL_MACHINE, keypath)
For iterating through the keys of the Windows registry, you need EnumKey() from the _winreg module. Given below is the definition of EnumKey():
def EnumKey(key, index):
Enumerates subkeys of an open registry key.
key is an already open key, or any one of the predefined HKEY_* constants.
index is an integer that identifies the index of the key to retrieve.
Note that this method takes an index as an argument and will give you only the key at that index. Therefore, in order to get all the keys, you need to increment the index by one and continue until you encounter a WindowsError.
Refer to this post for a detailed understanding of the same. The GitHub link for the code can be found in the post.
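In other words, the pattern boils down to something like this (a minimal sketch using _winreg as above):
from _winreg import EnumKey, HKEY_LOCAL_MACHINE

index = 0
while True:
    try:
        # Ask for subkey 0, 1, 2, ... until EnumKey runs out and raises WindowsError.
        print EnumKey(HKEY_LOCAL_MACHINE, index)
        index += 1
    except WindowsError:
        break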
This is just an improvement on @sparrowt's answer. His answer failed when I tried it, but I think it was on the right track. This will give the tree below keypath. Sorry for changing some variable and method names in the rewrite; I was just trying to post this quickly. In my code, I put hkey as a global variable because I expect to only be working with one hkey at a time.
import winreg
import itertools

hkey = winreg.HKEY_LOCAL_MACHINE

def list_subkeys(path, hkey_set=None):
    global hkey
    if not hkey_set:
        hkey_set = hkey
    try:
        registry_key = winreg.OpenKey(hkey_set, path, 0, winreg.KEY_READ)
        for i in itertools.count():
            yield winreg.EnumKey(registry_key, i)
        winreg.CloseKey(registry_key)
    except Exception as ex:
        if str(ex) != '[WinError 259] No more data is available':
            print(str(ex))
        try:
            winreg.CloseKey(registry_key)
        except:
            pass
        return []

def traverse_registry_tree(keypath, levels=None, hkey_set=None):
    global hkey
    if not levels:
        levels = []
    if not hkey_set:
        hkey_set = hkey
    subkeys = list(list_subkeys(keypath, hkey_set))
    for subkeyname in subkeys:
        subkeypath = "%s\\%s" % (keypath, subkeyname)
        levels.append(subkeypath)
        levels.extend([r for r in traverse_registry_tree(subkeypath, levels, hkey_set) if r not in levels])
    del subkeys
    return levels
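A quick usage sketch (the key path here is only an example): traverse_registry_tree() returns the accumulated list of full subkey paths, so it can be consumed directly:
for path in traverse_registry_tree(r"SOFTWARE\Microsoft\Windows NT\CurrentVersion"):
    print(path)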
I made this code. It prints the whole subtree of a key. But first, I would like to say that the following code does not use the original registry naming:
[key]
|-[key]
| └-[value1: data]
|-[value1: data]
└-[value2: data]
I use the following scheme
[folder]
|-[subfolder]
| └-[key1: value]
|-[key1: value]
└-[key2: value]
import winreg

def padding(cnt):
    pad = ""
    for i in range(cnt):
        pad += '\t'
    return pad

def recursive_visit(folder, i):
    subfolders_count, subkeys_count, modified = winreg.QueryInfoKey(folder)
    for subfolder_index in range(subfolders_count):
        try:
            subfolder_name = winreg.EnumKey(folder, subfolder_index)
            print(padding(i) + subfolder_name + ":")
            with winreg.OpenKeyEx(folder, subfolder_name) as subfolder:
                recursive_visit(subfolder, i + 1)
        except (WindowsError, KeyError, ValueError):
            print("Error reading subfolder %d" % subfolder_index)
    for subkey_index in range(subkeys_count):
        print(padding(i) + str(winreg.EnumValue(folder, subkey_index)))
        globals()["cnt"] += 1

cnt = 0  # how many keys we visited

### ENTER INPUT HERE
root = winreg.HKEY_LOCAL_MACHINE
folder = r"SYSTEM\Setup\FirstBoot\Services"
###

with winreg.OpenKey(root, folder) as reg:
    keys_count, values_count, modified = winreg.QueryInfoKey(reg)
    print("Subfolders: " + str(keys_count) + " Subkeys: " + str(values_count))
    recursive_visit(reg, 1)

print("visited " + str(cnt) + " leaf keys")
This gives you the following output:
Subkeys: 72 Subvalues: 1
AdobeARMservice:
('ServiceName', 'AdobeARMservice', 1)
('Path', '"C:\\Program Files (x86)\\Common Files\\Adobe\\ARM\\1.0\\armsvc.exe"', 1)
('Path.Org', '"C:\\Program Files (x86)\\Common Files\\Adobe\\ARM\\1.0\\armsvc.exe"', 1)
('Path.Win32', 'C:\\Program Files (x86)\\Common Files\\Adobe\\ARM\\1.0\\armsvc.exe', 1)
('StartName', 'LocalSystem', 1)
('DisplayName', 'Adobe Acrobat Update Service', 1)
('Type', 16, 4)
('StartType', 2, 4)
('ErrorControl', 0, 4)
('LoadOrderGroup', '', 1)
('TagId', 0, 4)
AdobeUpdateService:
('ServiceName', 'AdobeUpdateService', 1)
...

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on them, then closes them.
The thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (even though the code works on each individual file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create Launcher and LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
    global LauncherCount
    LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work either, as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to globally edit a variable in an imported file? Or is there any other way to do this? What I need is to run the code multiple times, changing one value each time, and apparently without using any loop.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount

class Filter:
    """ Filtering methods """
    def __init__(self):
        print("launching methods")

    # Return the list: [Latitude,Longitude] (elements are floating point numbers)
    def LatLong(self, line):
        comaCount = []
        comaCount.append(line.find(','))
        comaCount.append(line.find(',', comaCount[0] + 1))
        comaCount.append(line.find(',', comaCount[1] + 1))
        Lat = line[comaCount[0] + 1 : comaCount[1]]
        Long = line[comaCount[1] + 1 : comaCount[2]]
        try:
            return [float(Lat), float(Long)]
        except ValueError:
            return [0, 0]

    # Return a boolean:
    # - True if the Lat/Long is within the Lat/Long rectangle defined by:
    #   tupleFilter = (minLat,maxLat,minLong,maxLong)
    # - False if not
    def LatLongFilter(self, LatLongList, tupleFilter):
        if (tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
                tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]):
            return True
        else:
            return False

    def writeLine(self, key, line):
        filterDico[key][1].write(line)

def filteringProcess(dico):
    myFilter = Filter()
    while True:
        try:
            currentLine = readFile.readline()
        except ValueError:
            break
        if len(currentLine) == 0:  # Breaks at the end of the file
            break
        if len(currentLine) < 35:  # Deletes wrong lines (too short)
            continue
        LatLongList = myFilter.LatLong(currentLine)
        for key in dico:
            if myFilter.LatLongFilter(LatLongList, dico[key][0]):
                myFilter.writeLine(key, currentLine)

###########################################################################
# Main
###########################################################################

# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')

# Generate writing files:
pathDico = {}
filterDico = config.filterDico

# Create outputs
for key in filterDico:
    output_Name = (config.readFileList[LauncherCount.LauncherCount][0][:-4]
                   + '_' + key + '.log')
    pathDico[output_Name] = config.writingFolder + output_Name
    filterDico[key] = [filterDico[key], open(pathDico[output_Name], 'w')]

p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()

if __name__ == '__main__':
    ### Create and start processes:
    for i in CPURange:
        p.append(multiprocessing.Process(target=filteringProcess,
                                         args=(filterDico,)))
        p[i].start()

    ### Kill processes:
    while True:
        if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
            readFile.close()
            for key in config.filterDico:
                config.filterDico[key][1].close()
                print(key, "is Done!")
            endTime = time.localtime()
            break

    print("Process started at:", startingTime)
    print("And ended at:", endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool

def work_on(args):
    """Process a single file."""
    i, filename = args
    print("working on %s" % (filename,))
    return i

def files():
    """Generate input filenames to work on."""
    #NOTE: you could read the file list from a file, get it using glob.glob, etc
    yield "inputfile1"
    yield "inputfile2"

def process_files(pool, filenames):
    """Process filenames using pool of processes.

    Wait for results.
    """
    for result in pool.imap_unordered(work_on, enumerate(filenames)):
        #NOTE: in general the files won't be processed in the original order
        print(result)

def main():
    p = Pool()
    # to do "successive" multiprocessing
    for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
        process_files(p, filenames)

if __name__ == "__main__":
    main()
Each process_files() call runs in sequence, after the previous one has completed; i.e., the files from different calls to process_files() are not processed in parallel.
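As for the original question about editing a global in an imported module (my reading of the setup, not something stated in the question): os.system() starts a completely separate Python process, so a global set in the launcher's copy of LauncherCount never reaches the script being launched. Passing the index on the command line and reading sys.argv in the worker achieves the same effect without a loop inside the worker:
# Launcher.py (sketch): pass the file index as a command-line argument
# instead of trying to mutate a module-level global across processes.
import os

for i in range(2):
    os.system("python OrientedFilterNoLoop.py %d" % i)

# And near the top of OrientedFilterNoLoop.py (sketch), read the index back:
#     import sys
#     LauncherCount = int(sys.argv[1])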
