When I run a data-streaming subprocess together with a multiprocessing worker, the program freezes after printing a single element of the dictionary:
#!/usr/bin/python
import subprocess,time, timeit
from multiprocessing import Process, Queue
import re, os, pprint, math
from collections import defaultdict
Dict = {}
count = defaultdict(int)
queueVar = Queue()
def __ReadRX__(RX_info):
    """Parse lines from the receiver's stdout and publish a running summary.

    Every line that starts with a digit is split on whitespace; fields 1-3
    are discarded, and the remaining fields are read as identifier (index 1),
    dlc (index 2) and hexbits (index 3 onwards).  For each identifier the
    module-level ``Dict`` keeps ``[dlc, hexbits, occurrence count, timing]``,
    where timing is the wall-clock gap since the previously parsed line,
    scaled by 10000 and truncated to an int.

    After each parsed line ONE snapshot of ``Dict`` is put on ``queueVar``.
    (The original enqueued the whole dict once per key for every line, which
    flooded the queue with duplicates.)

    Args:
        RX_info: a ``subprocess.Popen`` object created with
            ``stdout=subprocess.PIPE``.
    """
    try:
        # time.clock() reports CPU time on Unix, which barely advances while
        # this process is blocked on the pipe; wall-clock time is what the
        # interval between two lines needs.
        start = time.time()
        for line in iter(RX_info.stdout.readline, ""):
            if not re.match(r"^\d+.*$", line):
                continue
            splitline = line.split()
            del splitline[1:4]
            identifier = splitline[1]
            count[identifier] += 1
            end = time.time()
            timing = int((end - start) * 10000)
            dlc = splitline[2]
            hexbits = splitline[3:]
            Dict[identifier] = [dlc, hexbits, count[identifier], timing]
            start = end
            # Send a copy so later updates here cannot alias a queued item.
            queueVar.put(dict(Dict))
    except KeyboardInterrupt:
        pass
# Start the external receiver; its stdout is consumed by __ReadRX__.
procRX = subprocess.Popen('receivetest -f=/dev/pcan32'.split(), stdout=subprocess.PIPE)

if __name__ == '__main__':
    munchCan = Process(target=__ReadRX__, args=(procRX,))
    munchCan.start()
    try:
        # The reader keeps producing snapshots, so keep consuming them.
        # A single queueVar.get() only ever shows the first snapshot, which
        # is why the original appeared to freeze after one entry.
        while True:
            print(queueVar.get())
    except KeyboardInterrupt:
        pass
    finally:
        munchCan.terminate()
        procRX.terminate()
I know that if I print from inside __ReadRX__ it prints a constant stream; however, when I try to print from outside the function I only get a single entry of the dictionary.
add the following at the top:
from time import sleep
then after the print statement add:
sleep(1)
this will make the script wait for 1 second. You can adjust that number as necessary.
Related
I'm a hobby coder started with AHK, then some java and now I try to learn Python. I have searched and found some tips but I have yet not been able to implement it into my own code.
Hopefully someone here can help me, it's a very short program.
I'm using .txt csv database with ";" as a separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20,000 lines, which makes the program too slow; it only uses 25% CPU (1 core).
If I can make it use all 4 cores (100%), I guess it would perform the task a lot faster. The task is basically to compare the CLIPBOARD with the database and, if there is a match, return the answer. Perhaps I could also split the database into 4 pieces?
The code currently looks like this. It is no more than 65 lines and it does its job (but too slowly). I need advice on how to make this process use multiple cores.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
    """Read the question/answer file, retrying until the read succeeds.

    Reads ``db_file_path`` with ``db_separator`` / ``db_encoding`` and drops
    duplicate rows.  On failure it prints a message, sleeps
    ``fall_back_time`` seconds and tries again.

    Returns:
        pandas.DataFrame: the de-duplicated table.
        NOTE(review): callers index the columns 'question' and 'answer', so
        the file presumably starts with a header row naming them - confirm.
    """
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            return db.drop_duplicates()
        except Exception:
            # Not a bare except: Ctrl-C must still be able to break the loop.
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
def top_answers(db, question):
    """Score every stored question against *question* and keep the good ones.

    Writes a 'ratio' column (``fuzz.ratio`` score) into *db* itself, then
    returns the rows whose ratio is at least ``ratio_threshold``, ordered
    from highest to lowest ratio.
    """
    def score(candidate):
        return fuzz.ratio(candidate, question)

    db['ratio'] = db['question'].apply(score)
    ranked = db.sort_values(by='ratio', ascending=False)
    keep = ranked['ratio'] >= ratio_threshold
    return ranked[keep]
def write_txt(top):
    """Write the 'answer' column of *top* to svar.txt, one answer per line.

    After writing, the clipboard is cleared with ``pp.copy("")``.

    Args:
        top: DataFrame with an 'answer' column (may be empty).
    """
    # Iterating the column works for an empty frame too, whereas
    # DataFrame.apply(..., axis=1) returns a DataFrame (no .tolist()) then.
    result = '\n'.join("%s" % answer for answer in top['answer'])
    # The context manager closes the file even if the write fails.
    with open("svar.txt", "w") as fileHandle:
        fileHandle.write(result)
    pp.copy("")
def main():
    """Poll the clipboard and write matching answers to disk.

    Loads the database once, then loops: read the clipboard, rank the
    database against it, write any matches with ``write_txt`` and sleep
    ``fall_back_time`` seconds.  If anything raises, a message is printed
    and the function returns after one more sleep.
    """
    try:
        db = load_db()
        while True:
            # Get contents of clipboard
            question = pp.paste()
            # Rank answer
            top = top_answers(db, question)
            # If answer was found, show results
            if len(top) > 0:
                write_txt(top)
            time.sleep(fall_back_time)
    except Exception:
        # Not a bare except, so Ctrl-C still stops the program.
        print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
        time.sleep(fall_back_time)
# Run only when executed as a script (the dunder underscores were lost).
if __name__ == '__main__':
    main()
If you could divide the db into four equally large you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
    """Match *question* against one database part and write the answers.

    Reads the file ``db_file_path + str(thread_id)``, scores each stored
    question with ``fuzz.ratio``, keeps rows scoring at least
    ``ratio_threshold`` (best first) and writes their answers to
    ``svar<thread_id>.txt``, one per line.  The clipboard is cleared
    afterwards.

    Args:
        thread_id: suffix selecting the database part and the output file.
        question: text to compare against the stored questions.
    """
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    top = db.sort_values(by='ratio', ascending=False)
    top = top[top['ratio'] >= ratio_threshold]
    # Iterating the column is safe when nothing matched; the original
    # DataFrame.apply(..., axis=1).tolist() fails on an empty frame.
    result = '\n'.join("%s" % answer for answer in top['answer'])
    with open("svar" + thread_id + ".txt", "w") as fileHandle:
        fileHandle.write(result)
    pp.copy("")
    return
def main():
    """Run one worker thread per database part and wait for all of them.

    The clipboard is read once and the same question is handed to every
    worker.  Parts are numbered 1-4 (``range(1, 4)`` only covered three of
    the four parts described).  All threads are started before any is
    joined so they can overlap.

    NOTE(review): the scoring is CPU-bound Python code, so threads will
    likely be serialised by the GIL; processes are needed for a real
    multi-core speed-up.
    """
    question = pp.paste()
    threads = [
        threading.Thread(target=worker, args=(i, question))
        for i in range(1, 5)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
# Run only when executed as a script (the dunder underscores were lost).
if __name__ == '__main__':
    main()
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses better pickle to tranfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
    """Load the database and split it into roughly equal row chunks.

    Reads ``db_file_path`` (retrying every ``fall_back_time`` seconds until
    it succeeds), names the columns 'question' and 'answer', and cuts the
    rows into at most ``NUM_PROCESSES`` consecutive chunks.

    Returns:
        list of pandas.DataFrame: the chunks, in file order.  An empty
        database yields a single empty chunk.

    NOTE(review): read_csv treats the first line as a header by default, so
    if the file has no header row its first record is lost when the column
    names are overwritten - confirm the file layout.
    """
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            break
        except Exception:
            # Not a bare except: Ctrl-C must be able to stop the retry loop.
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
    # Split the database into equal chunks.  This duplicates the data; if
    # RAM is tight, pass (start, stop) row ranges to the workers instead.
    # np.split(db, [NUM_PROCESSES]) cut at row index NUM_PROCESSES, giving
    # only two very unequal pieces, so the bounds are computed explicitly.
    if len(db) == 0:
        return [db]
    num_chunks = NUM_PROCESSES or 1  # os.cpu_count() may return None
    chunk_size = -(-len(db) // num_chunks)  # ceiling division
    chunked_db = [
        db.iloc[start:start + chunk_size].copy()
        for start in range(0, len(db), chunk_size)
    ]
    return chunked_db
def top_answers_multiprocessed(question, chunked_db):
    """Score *question* against every chunk in parallel and merge the hits.

    Args:
        question: text to compare with the stored questions.
        chunked_db: list of DataFrames, each with a 'question' column.

    Returns:
        pandas.DataFrame: all rows whose ``fuzz.ratio`` score is at least
        ``ratio_threshold``, with the score in a 'ratio' column, ordered
        best first.  Empty when nothing matches.
    """
    # Notes from the original author:
    # - On Unix, Python uses 'fork' by default, so workers get copy-on-write
    #   access to the parent's data; modifying it in a worker forces a copy.
    #   On Windows only 'spawn' (a full copy of everything) is available.
    # - Results travel through pickle, which is slow, so a small database
    #   may not benefit from multiprocessing at all.  For big transfers look
    #   at multiprocessing.Array.
    # - The pool is recreated on every call, which is not efficient.
    #   Long-lived workers fed from a Queue would be faster but are harder
    #   to implement.
    if not chunked_db:
        return pd.DataFrame(columns=['question', 'answer', 'ratio'])

    def top_answers(idx):
        # `question` comes from the enclosing scope; only the chunk index is
        # sent to the worker.
        chunk = chunked_db[idx]
        scored = chunk.assign(
            ratio=chunk['question'].apply(lambda q: fuzz.ratio(q, question))
        )
        return scored[scored['ratio'] >= ratio_threshold]

    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        res = list(pool.imap_unordered(top_answers, range(len(chunked_db))))

    # Stack the per-chunk hits.  The original pd.merge(..., on='ratio') was
    # a join on equal scores, which drops rows and renames 'answer'.
    res_final = pd.concat(res).sort_values(by='ratio', ascending=False)
    return res_final
def write_txt(top):
    """Write the 'answer' column of *top* to svar.txt, one answer per line.

    After writing, the clipboard is cleared with ``pp.copy("")``.

    Args:
        top: DataFrame with an 'answer' column (may be empty).
    """
    # Iterating the column also works for an empty frame, unlike
    # DataFrame.apply(..., axis=1).tolist().
    result = '\n'.join("%s" % answer for answer in top['answer'])
    # The context manager closes the file even if the write fails.
    with open("svar.txt", "w") as fileHandle:
        fileHandle.write(result)
    pp.copy("")
def mainfunc():
    """Watch the clipboard and print the best answers for each new text.

    Loads the chunked database into the module-level ``chunked_db``, then
    polls the clipboard.  Whenever its content changes to a non-empty
    string, the stripped text is ranked with
    ``top_answers_multiprocessed`` and any matches are printed.
    """
    global chunked_db
    chunked_db = load_db()
    print('db loaded')
    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except Exception:
            time.sleep(fall_back_time)
            continue
        if new_clip == last_clip or len(new_clip) == 0:
            # Nothing new: wait instead of spinning a core at 100%.
            time.sleep(fall_back_time)
            continue
        print(new_clip)
        last_clip = new_clip
        question = new_clip.strip()
        # Rank answer
        top = top_answers_multiprocessed(question, chunked_db)
        # If answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)
if __name__ == '__main__':
mainfunc()
I may be approaching this all wrong, but this is where I'm at. I have very large log files that I'm trying to search, up to 30 GB in some cases. I'm writing a script to pull information out of them and have been experimenting with multiprocessing to speed it up a bit. Right now I'm testing running two functions at the same time, one searching from the top and one from the bottom, which seems to work. I'm wondering whether it is possible to stop one function once the other has a result, so that if the top function finds a result they both stop. That way I can build it out as needed.
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys
z = "log.log"
#!/usr/bin/env python
rocket = 0
def top():
    """Print the first line of file ``z`` containing "test", or None.

    Scans from the start of the file and stops at the first hit.
    """
    target = "test"
    with open(z) as src:
        # Iterating a file stops at EOF by itself and never yields an empty
        # string, so the old ``len(line) == 0`` check could never fire.
        found = next((line for line in src if target in line), None)
    print(found)
def bottom():
    """Print the last line of file ``z`` containing "text", or None.

    Scans from the end of the file towards the start and stops at the
    first hit.
    """
    target = "text"
    with FileReadBackwards(z) as src:
        # No ``len(line) == 0`` bail-out: FileReadBackwards yields lines
        # without their terminator, so a blank line in the log is '' and
        # would have ended the search early.
        found = next((line for line in src if target in line), None)
    print(found)
# Launch the forward and the backward search as two separate processes.
if __name__ == '__main__':
    p1, p2 = Process(target=top), Process(target=bottom)
    p1.start()
    p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value
def search(proc_no, file_name, seek_to, max_size, find, flag):
    """Scan one region of a text file for a line equal to *find*.

    Reading starts at a line boundary at or before *seek_to* and continues
    until a match is found, another searcher has set *flag*, the file ends,
    or the read position passes ``seek_to + max_size``.

    Args:
        proc_no: value stored in ``flag.value`` on a match.
        file_name: path of the file to scan.
        seek_to: offset at which this searcher's region starts.
        max_size: size of the region.
        find: text compared against each stripped line.
        flag: object with a ``value`` attribute; any truthy value makes
            every searcher stop.
    """
    stop_at = seek_to + max_size
    with open(file_name) as f:
        if seek_to:
            f.seek(seek_to - 1)
            prev_char = f.read(1)
            if prev_char != '\n':
                # Landed in the middle of a line.  Skip back one (or maybe
                # more) lines so this line isn't excluded.  Start by
                # seeking back 256 bytes, then 512 if necessary, etc.
                exponent = 8
                pos = seek_to
                while pos >= seek_to:
                    target = max(0, seek_to - (2 ** exponent))
                    f.seek(target)
                    if target == 0:
                        # Offset 0 is itself a line start.  Calling
                        # readline() here would skip the first line, and
                        # when the region begins inside that first line the
                        # original loop never terminated.
                        break
                    f.readline()  # discard the partial line
                    pos = f.tell()
                    exponent += 1
        while True:
            if flag.value:
                break
            line = f.readline()
            if not line:
                break  # EOF
            data = line.strip()
            if data == find:
                flag.value = proc_no
                print(data)
                break
            if f.tell() > stop_at:
                break
# Driver: split list.txt into equal regions and search them in parallel.
if __name__ == '__main__':
    # list.txt contains lines with the numbers 1 to 1000001
    file_name = 'list.txt'
    file_size = os.stat(file_name).st_size

    if len(sys.argv) != 1:
        find = sys.argv[1]
    else:
        # Pick a random value from list.txt
        num_lines = 1000001
        choices = list(range(1, num_lines + 1)) + ['XXX']
        find = str(random.choice(choices))

    num_procs = 4
    chunk_size, remainder = divmod(file_size, num_procs)
    max_size = chunk_size + remainder
    flag = Value('i', 0)

    print(f'Using {num_procs} processes to look for {find} in {file_name}')

    procs = [
        Process(
            target=search,
            args=(i + 1, file_name, i * chunk_size, max_size, find, flag),
        )
        for i in range(num_procs)
    ]
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()

    if not flag.value:
        print(find, 'not found')
    else:
        print(find, 'found by proc', flag.value)
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys
def search_file(file_name, text, encoding='utf-8'):
    """Return the first line of *file_name* that contains *text*.

    The file is memory-mapped read-only and searched as bytes.

    Args:
        file_name: path of the file to search.
        text: string to look for; encoded with *encoding* before searching.
        encoding: codec used to encode *text* and decode the returned line.

    Returns:
        The whole matching line, including its line terminator when
        present, or None if there is no match or the file is empty.
    """
    needle = text.encode(encoding)
    # Binary mode: only the file descriptor is used, so no decoding layer.
    with open(file_name, 'rb') as f:
        f.seek(0, 2)
        if f.tell() == 0:
            return None  # mmap refuses to map an empty file
        # ACCESS_READ belongs to the portable `access` argument, not `flags`.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            index = m.find(needle)
            if index == -1:
                return None
            # The line starts just after the previous newline, or at
            # offset 0 (rfind returns -1) when there is none.
            line_start = m.rfind(b'\n', 0, index) + 1
            m.seek(line_start)
            return m.readline().decode(encoding)
# Command-line entry point: <script> FILE SEARCH_STRING
if __name__ == '__main__':
    file_name, search_string = sys.argv[1:]
    line = search_file(file_name, search_string)
    if line is not None:
        sys.stdout.write(line)
    else:
        sys.stdout.write(f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Simple example using a multiprocessing.Pool and callback function.
Terminates remaining pool processes once a result has returned.
You could add an arbitrary number of processes to search from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random
def search(pid, wait):
    """Stand-in for a real search: block for *wait* seconds, then return *pid*."""
    time.sleep(wait)
    return pid
def done(result):
    """Callback for the first finished task: report it and stop the pool.

    Terminating the module-level ``pool`` stops the remaining workers.
    """
    print("Process: %d done." % result)
    pool.terminate()
    print("Terminate Pool")
# Guarded so that importing this module (which worker processes do under
# the 'spawn' start method) does not create another pool.
if __name__ == '__main__':
    pool = Pool(2)
    pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
    pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)
    # do other stuff ...
    # Wait for result
    pool.close()
    pool.join()  # block our main thread
This is essentially the same as Blurp's answer, but I shortened it and made it a bit more general. As you can see, top would be an infinite loop on its own, but bottom stops it immediately.
from multiprocessing import Process
# A plain module-level boolean cannot be used as the stop signal: each
# process has its own copy, and assigning it inside bottom() only creates a
# local variable.  A multiprocessing.Event is shared between processes.


def top(val_found=None):
    """Count upwards until *val_found* is set.

    Args:
        val_found: an Event-like object with ``is_set()``.  With the
            default None there is no stop signal and the loop never ends.
    """
    i = 0
    while val_found is None or not val_found.is_set():
        i += 1


def bottom(val_found=None):
    """Signal that the value was found, which lets top() return.

    Args:
        val_found: an Event-like object with ``set()``; ignored when None.
    """
    if val_found is not None:
        val_found.set()


if __name__ == '__main__':
    from multiprocessing import Event

    val_found = Event()
    p1 = Process(target=top, args=(val_found,))
    p2 = Process(target=bottom, args=(val_found,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
Not sure if it's possible, but I was hoping to do something where I can print a hyphen for the width of the terminal on one line. If the window's width is resized, the amount of hyphens displayed would print accordingly.
This is a more elaborated version that allows printing whatever you want always according to the dimension of the terminal. You can also resize the terminal while nothing is being printed and the content will be resized accordingly.
I commented the code a little bit... but if you need I can be more explicit.
#!/usr/bin/env python2
import threading
import Queue
import time
import sys
import subprocess
from backports.shutil_get_terminal_size import get_terminal_size
printq = Queue.Queue()
interrupt = False
lines = []
def main():
    """Start the printer thread and feed it demo lines every half second."""
    # Turn the printer on
    printer_thread = threading.Thread(target=printer)
    printer_thread.daemon = True
    printer_thread.start()
    # Stupid example of stuff to print
    for i in xrange(1, 100):
        text = ' '.join(map(str, range(1, i)))
        printq.put(text)  # The actual way to send stuff to the printer
        time.sleep(.5)
def split_line(line, cols):
    """Cut *line* at a word boundary so the first part fits the terminal.

    Args:
        line: text to split.
        cols: available width in characters.

    Returns:
        tuple ``(head, rest)``.  If *line* is not longer than *cols*, or its
        first word alone is too long to fit, the result is ``(line, '')``.
        Otherwise *head* holds the leading words, each followed by one
        space, and *rest* the remaining words joined by single spaces.
    """
    if len(line) <= cols:
        return (line, '')
    words = line.split()
    new_line = ''
    i = 0
    # The bounds check matters when the line is long only because of
    # repeated whitespace: every word then fits and the original indexed
    # past the end of the word list.  The leftover debug print is gone.
    while i < len(words) and len(new_line) <= (cols - len(words[i]) - 1):
        new_line += words[i] + ' '
        i += 1
    if new_line == '':
        return (line, '')
    return (new_line, ' '.join(words[i:]))
def printer():
    """Redraw all queued lines, wrapped to the current terminal width.

    Runs forever, every half second: take at most one new message from
    ``printq``, rebuild the whole screen (a '#---#' ruler followed by every
    stored line, wrapped with ``split_line``), clear the terminal and write
    it out.  The message '!##EXIT##!' ends the thread via ``sys.exit()``.
    """
    while True:
        cols, rows = get_terminal_size()  # Get the terminal dimensions
        pieces = ['#' + '-' * (cols - 2) + '#\n']  # Ruler across the top
        try:
            new_line = str(printq.get_nowait())
        except Queue.Empty:
            pass
        else:
            if new_line == '!##EXIT##!':
                # A nice way to shut the printer thread down gracefully
                printq.task_done()
                sys.exit()
            lines.append(new_line)
            printq.task_done()
        # Build the new message to show and split too long lines
        for line in lines:
            remaining = line
            while len(remaining) != 0:
                toprint, remaining = split_line(remaining, cols)
                pieces.append('\n' + toprint)
        # Clear the shell and print the new output
        subprocess.check_call('clear')  # Keep the shell clean
        sys.stdout.write(''.join(pieces))
        sys.stdout.flush()
        time.sleep(.5)
if __name__ == '__main__':
main()
Check this out:(it worked on windows and python3 )
import os
# Resize the Windows console twice, pausing for input after each resize.
for cols, rows in ((100, 40), (1000, 400)):
    os.system('mode con: cols=%d lines=%d' % (cols, rows))
    input("Press any key to continue...")
This is doing exactly what you asked for... with a very small issue: when you make the shell smaller the cursor goes down of one line and the stuff that is above will stay there.... I can try to solve this issue... but the result will be more complicated.
I assumed you are using a unix system.
The code uses threads to be able to keep the line on the screen while doing other things. In this case just sleeping... Moreover, only using a thread is actually possible to have a "fast" answer to the change of the dimension of the terminal.
#!/usr/bin/env python2
import threading
import time
import sys
from backports.shutil_get_terminal_size import get_terminal_size
def main1():
    """Run printer2 in a daemon thread while the main thread sleeps 10 s."""
    ruler_thread = threading.Thread(target=printer2)
    ruler_thread.daemon = True
    ruler_thread.start()
    time.sleep(10)
def printer2():
    """Redraw a '#----#' ruler as wide as the terminal, twice a second.

    The carriage return moves the cursor back to column 0 so each redraw
    overwrites the previous one on the same line.
    """
    while True:
        width = get_terminal_size()[0]
        ruler = "\r" + '#' + '-' * (width - 2) + '#'
        sys.stdout.write(ruler)
        sys.stdout.flush()
        time.sleep(.5)
I'm currently writing a script that reads reddit comments from a large file (5 GB compressed, ~30 GB of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue consumer (running in a separate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735, if it matters in the slightest). I can't seem to handle the error either; Windows just keeps shutting down Python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time
nlp = spacy.load('en')
def iter_comments(loc):
    """Yield the 'body' field of every JSON line in a bz2-compressed file.

    Args:
        loc: path of the .bz2 file; each line must be one JSON object.
    """
    with bz2.BZ2File(loc) as file_:
        # The line index was never used, so no enumerate() is needed.
        for line in file_:
            yield ujson.loads(line)['body']
objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')
def worker():
    """Forever: take one item from queue ``q`` and write it to file ``f``."""
    while True:
        f.write(q.get())
        q.task_done()
# Start two daemon threads that drain the queue into the output file.
for _ in range(2):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()
def finish_parse(comment):
    """Parse *comment* with spaCy and queue the serialised result.

    A MemoryError is reported and followed by a 2-second pause; an
    AssertionError is reported.  In both cases the comment is not queued.
    """
    try:
        q.put(nlp(unicode(comment)).to_bytes())
    except MemoryError:
        print("MemoryError with comment {0}, waiting for Queue to empty".format(comment))
        time.sleep(2)
    except AssertionError:
        print("AssertionError with comment {0}, skipping".format(comment))
# Parse in the main thread.  Starting one new thread per comment put no
# limit on the number of live threads (each holding a parsed document), a
# likely source of the MemoryError.  It also let q.join() and f.close()
# run while parser threads were still producing output.
for comment in tqdm(objects):
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        finish_parse(comment)
# Everything has been queued by now; wait for the writers to flush it.
q.join()
f.close()
Does anybody know what I'm doing wrong?
It looks like the problem is not in your code but may be in the data. Have you tried skipping that iteration?
# Same loop as in the question, but skipping the iteration that fails.
# (The original `if x != 8162735` lacked its colon, a syntax error.)
for x, comment in enumerate(tqdm(objects), start=1):
    if x == 8162735:
        continue
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
        c_parse_thread.start()
I have a script that parses XML files using the ElementTree XPath evaluator. It works fine as it is, but it takes a long time to finish, so I tried to make a multithreaded implementation:
import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue
STOPWORDS = stopwords.words('dutch')
STOPWORDS.extend(stopwords.words('english'))
DIR_NAME = 'A_DIRNAME'
PATTERN = '*.A_PATTERN'
def loadData(dir_name, pattern):
    """Walk *dir_name* and collect the files whose name matches *pattern*.

    Returns:
        tuple ``(files, dir_names, dir_paths)``: the full paths of matching
        files, the sub-directory name list of every visited directory, and
        the path of every visited directory, all in ``os.walk`` order.
    """
    matched, dir_names, dir_paths = [], [], []
    for root, dirnames, filenames in os.walk(dir_name):
        dir_names.append(dirnames)
        dir_paths.append(root)
        matched.extend(
            os.path.join(root, name)
            for name in fnmatch.filter(filenames, pattern)
        )
    return matched, dir_names, dir_paths
def freq(element_list, descending = True):
    """Count the elements of *element_list*.

    Returns:
        list of ``(element, count)`` pairs ordered by count, highest first
        unless *descending* is false.  Equal counts keep the order in which
        the elements first appeared.
    """
    tally = defaultdict(int)
    for item in element_list:
        tally[item] += 1
    pairs = list(tally.items())
    pairs.sort(key=operator.itemgetter(1), reverse=descending)
    return pairs
def lexDiv(amount_words):
    """Return the lexical diversity: distinct words divided by total words.

    Args:
        amount_words: a sequence of words.

    Returns:
        float: a value between 0.0 and 1.0.  An empty sequence gives 0.0
        instead of raising ZeroDivisionError.
    """
    if not amount_words:
        return 0.0
    return 1.0 * len(set(amount_words)) / len(amount_words)
def anotherFreq(list_types, list_words):
    """Print the ten most frequent types and the ten most frequent words.

    Args:
        list_types: iterable of type labels, counted with ``FreqDist``.
        list_words: iterable of words.  A word is left out when its
            lower-cased form is in ``STOPWORDS``; counting itself is
            case-sensitive.
    """
    fd = FreqDist(list_types)
    print('top 10 most frequent types:')
    # Sort explicitly: slicing fd.items() only works where items() returns
    # a list, and its order is not guaranteed to be by frequency.
    by_count = sorted(fd.items(), key=operator.itemgetter(1), reverse=True)
    for t, n in by_count[:10]:  # `n`, so the module-level freq() isn't shadowed
        print('%s %s' % (t, n))
    print('\ntop 10 most frequent words:')
    stop = set(STOPWORDS)  # set membership instead of scanning a list per word
    agglomerated = defaultdict(int)
    for w in list_words:
        if w.lower() not in stop:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_dict[:10])
def extractor(f):
    """Parse one XML file and run a fixed set of XPath queries over it.

    The results are computed but neither returned nor stored; the function
    always returns None.  A file that fails to parse is reported and
    skipped.

    Args:
        f: path (or file object) handed to ``lxml.etree.ElementTree``.

    Raises:
        IndexError: if the document has no ``text/wf`` element carrying a
            ``sent`` attribute (the ``[-1]`` / ``[0]`` look-ups below).
    """
    print("check file: {}".format(f))
    try:
        # doc = lxml.etree.ElementTree(lxml.etree.XML(f))
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError as e:
        print(e)
        return
    doc_evaluator = lxml.etree.XPathEvaluator(doc)
    # Attribute steps are written with '@' in XPath; the '#' they had been
    # garbled into is not valid XPath syntax.
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',
                      namespaces={"re": "http://exslt.org/regular-expressions"})
    unique_words = set(words)
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',
                             namespaces={"re": "http://exslt.org/regular-expressions"})
    amount_of_sentences = doc_evaluator('text/wf/@sent')[-1]
    types = doc_evaluator('//term/@morphofeat')
    longest_sentence = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',
                                      namespaces={"re": "http://exslt.org/regular-expressions"}))[0]
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]
def worker():
    """Forever: take a file name from queue ``q`` and run extractor on it."""
    while True:
        extractor(q.get())
        q.task_done()  # this job is complete, move on
# Queue every matching file, process the queue with 20 daemon threads and
# print the elapsed time (measured from after the directory walk).
if __name__ == '__main__':
    files, dirs, path = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()
    q = Queue.Queue()  # job queue
    for f in files:
        q.put(f)
    for i in range(20):  # make 20 worker threads ready
        worker_thread = threading.Thread(target=worker)
        worker_thread.daemon = True
        worker_thread.start()
    q.join()
    elapsed = datetime.now() - startTime
    print(elapsed)
This does something, but when timing it, it isn't faster than the normal version. I think it has something to do with opening and reading files making the threader not multithreaded. If I use a function that instead of parsing the xml file just sleeps for a couple of second and prints something, it does work and it is a lot faster. What do I have to account for to have a multithreaded XML parser?
Threading in Python doesn't work as it does in other languages. CPython has a Global Interpreter Lock (GIL) that makes sure only one thread is running Python bytecode at any one time, so threads do not speed up CPU-bound work such as parsing.
What you want to do is use the `multiprocessing` module instead, which runs the work in separate processes.
You can read more about the GIL and Threading here:
https://docs.python.org/2/glossary.html#term-global-interpreter-lock
https://docs.python.org/2/library/threading.html