multi thread memory error issue with infinite while loop - python
I have an infinite loop waiting for messages. Messages accumulate for roughly 30 seconds, then I divide the accumulated batch into chunks and process the chunks in parallel.
# Worker: parse one chunk of messages and dump the parsed records to a JSON file.
# NOTE(review): `data` and `i` are never used — the loop reads a global `msgs`
# instead of the chunk passed in, so every thread reprocesses the whole batch.
# `record_data` and `rnd` are undefined here; this is pseudo-code.
def process_data(data, i):
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
# NOTE(review): random-suffix file names can collide across threads.
name=f"sample_{rnd}.json"
with open(name, "w") as outfile:
# BUG: file.write() requires a str; parsed_records is a list.
# json.dump(parsed_records, outfile) is what was presumably intended.
outfile.write(parsed_records)
return True
# Driver: spawn one thread per chunk each iteration, then wait for them.
# BUG: `true` is not defined in Python — the built-in constant is `True`,
# so this line raises NameError as written.
while true:
threads = []
# NOTE(review): `len(num_of_chunks)` then indexing `num_of_chunks[n]` suggests
# num_of_chunks is the list of chunks; enumerate() would be clearer.
for n in range(len(num_of_chunks)):
t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
threads.append(t)
t.start()
# Stop the threads
# BUG: joins `t` (the last-created thread) once per element instead of
# x.join(). Threads that are never joined, plus fresh Thread objects every
# iteration with no upper bound, are what drive the MemoryError.
for x in threads:
t.join()
But this leads to a MemoryError after a few iterations.
What should be updated to avoid the memory issue and make it run smoothly?
I even tried the code below,
import multiprocessing
from faker import Faker
# Create Faker object to generate fake data for Producer
fake = Faker()
# Return a fake random integer in [1, j].
# NOTE(review): the `i` argument is unused — presumably it was meant to be the
# lower bound (min=i); confirm before relying on the range.
def myrandomdata(i,j):
return fake.random_int(min = 1, max = j)
def divide_chunks(l, n):
    """Split sequence *l* into consecutive chunks of *n* elements each.

    The final chunk is shorter when len(l) is not a multiple of n.
    Returns a list of slices of *l*.
    """
    return [l[start:start + n] for start in range(0, len(l), n)]
# Worker: parse one chunk of messages and dump the parsed records to a JSON file.
# NOTE(review): `data` and `i` are unused — the loop iterates the global `msgs`
# rather than the chunk passed in, and `record_data` is undefined; pseudo-code.
def process_data(data, i):
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
# NOTE(review): random file names in [1, 2000] can collide between workers,
# silently overwriting earlier output.
rnd = myrandomdata(1, 2000)
name=f"sample_{rnd}.json"
with open(name, "w") as outfile:
# BUG: file.write() requires a str; parsed_records is a list.
# json.dump(parsed_records, outfile) is what was presumably intended.
outfile.write(parsed_records)
return True
# Entry point: multiprocessing requires worker definitions to be importable,
# hence the __main__ guard.
if __name__ == "__main__":
# BUG: `true` raises NameError — the built-in constant is `True`.
while true:
#sample data
msgs = [{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]}]
#msgs are nothing but bulk data received from the server continuously; I am appending to msgs
# Split the accumulated batch into chunks of 3 messages each.
chunk_msgs = divide_chunks(msgs, 3)
#clearing msgs to append next data after chunking previous data
# NOTE(review): clear() does not free the chunk copies already held by
# chunk_msgs, so it does not reduce peak memory here.
msgs.clear()
# NOTE(review): a new Pool is created and torn down on every iteration, and its
# size equals the number of chunks — unbounded for a large batch. Create one
# pool of cpu_count() workers once, outside the loop (see the answer below).
with multiprocessing.Pool(len(chunk_msgs)) as pool:
pool.starmap(process_data, [(chunk_msgs[n],n) for n in range(len(chunk_msgs))])
no luck :(
You should be creating the pool only once to avoid the repetitive creation and destruction of processes.
Assuming your processing is CPU-intensive, you should be creating a pool whose size is the number of CPU cores you have. Let's call this n_cores. Then you should split your msgs list into n_cores chunks where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements are in each chunk, but it would be more convenient for it to specify the total number of chunks and let it figure out how many elements need to be in each chunk.
import multiprocessing
def divide_chunks(iterable, n):
    """Split *iterable* into at most *n* contiguous chunks of near-equal size.

    The first ``len(iterable) % n`` chunks receive one extra element, so chunk
    sizes differ by at most 1. Returns a list of slices of *iterable* (lists,
    when a stepped range had to be materialized). When the iterable has fewer
    than *n* elements, one chunk per element is returned.

    Fix: an empty iterable previously raised ZeroDivisionError via
    ``divmod(0, 0)``; it now returns ``[]``.
    """
    # The slicing arithmetic below assumes unit steps; materialize stepped
    # ranges so positional slices line up with element counts.
    if type(iterable) is range and iterable.step != 1:
        iterable = list(iterable)
    total = len(iterable)
    if total == 0:
        return []
    n = min(total, n)
    # k = base chunk size, m = number of chunks that carry one extra element.
    k, m = divmod(total, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(n)]
# Worker skeleton: process one chunk of messages inside a pool process.
# Arguments are (chunk_number, msg_chunk) to match pool.starmap(..., enumerate(chunks)).
def process_data(chunk_number, msg_chunk):
#process data for chunks
try:
for msg in msg_chunk:
# data processing here according to my requirement
# it may take 20-25 seconds of process that is why am planning for parallel
# processing
...
# Broad catch is deliberate best-effort here: a bad message must not kill
# the worker process; the error is reported and the chunk is marked done.
except Exception as e:
print("exception", e)
# Always reports success, even after an exception — the caller only
# collects these values via starmap.
return True
if __name__ == "__main__": # only imports and function/class defs before this line.
n_cores = multiprocessing.cpu_count()
# Key fix for the MemoryError: the pool is created ONCE and its worker
# processes are reused across every iteration of the loop.
with multiprocessing.Pool(n_cores) as pool:
while True:
# Process next list of messages:
# Placeholder (`[...]` is the Ellipsis literal) — replace with the real
# batch received from the server.
msgs = [...]
chunks = divide_chunks(msgs, n_cores)
# NOTE(review): msgs is rebound next iteration anyway; clear() only
# matters if something else still references this list.
msgs.clear()
# starmap blocks until every chunk has been processed.
results = pool.starmap(process_data, enumerate(chunks))
Update to Use Multithreading
Read all the comments in the code and make sure you understand them!!!
from multiprocessing.pool import ThreadPool
from threading import Lock
def divide_chunks(iterable, n):
    """Split *iterable* into at most *n* contiguous chunks of near-equal size.

    The first ``len(iterable) % n`` chunks receive one extra element, so chunk
    sizes differ by at most 1. Returns a list of slices of *iterable* (lists,
    when a stepped range had to be materialized). When the iterable has fewer
    than *n* elements, one chunk per element is returned.

    Fix: an empty iterable previously raised ZeroDivisionError via
    ``divmod(0, 0)``; it now returns ``[]``.
    """
    # The slicing arithmetic below assumes unit steps; materialize stepped
    # ranges so positional slices line up with element counts.
    if type(iterable) is range and iterable.step != 1:
        iterable = list(iterable)
    total = len(iterable)
    if total == 0:
        return []
    n = min(total, n)
    # k = base chunk size, m = number of chunks that carry one extra element.
    k, m = divmod(total, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(n)]
# Monotonic counter for unique output file names; guarded by `lock` because
# multiple worker threads read-and-increment it concurrently.
FILE_NO = 1
lock = Lock()
# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs): # arguments must be in this order
global FILE_NO
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
# Get next file number
# Do not use random number generator:
with lock:
file_no = FILE_NO
FILE_NO += 1
name = f"sample_{file_no}.json"
with open(name, "w") as outfile:
outfile.write(parsed_records)
return True
if __name__ == "__main__": # only imports and function/class defs before this line.
# The number of chunks you want msgs split into
# (this will be the number of files created for each invocation of process_data)
# For now I will assume a fixed value of 10. If this is not true, then set
# POOL_SIZE to be what you think the maximum number of chunks you will have.
# But note this: depending upon where you are creating your files, writing more than
# one concurrently could hurt performance. This would be the case if you were, for example,
# writing to a non-solid state drive.
# Or recompute N_CHUNKS on each iteration based on size
# of msgs:
N_CHUNKS = 10
POOL_SIZE = N_CHUNKS
# The thread pool is created once and reused — the fix for the original
# MemoryError caused by spawning unbounded fresh threads per iteration.
with ThreadPool(POOL_SIZE) as pool:
while True:
# Process next list of messages:
# Placeholder (`[...]` is the Ellipsis literal) — replace with the real batch.
msgs = [...]
chunks = divide_chunks(msgs, N_CHUNKS)
msgs.clear()
# starmap blocks until all chunks are written before the next batch.
results = pool.starmap(process_data, enumerate(chunks))
Related
How would I limit the number of characters per line in python from an input
Would there be a way to limit the amount of characters that are printed per line? while 1: user_message = "" messageQ = input("""\nDo you want to enter a message? [1] Yes [2] No [>] Select an option: """) if messageQ == "1": message = True elif messageQ == "2": message = False else: continue if message == True: print( """ ----------------------------------------------------------------- You can enter a custom message that is below 50 characters. """) custom_message = input("""\nPlease enter your custom message:\n \n> """) if len(custom_message) > 50: print("[!] Only 50 characters allowed") continue else: print(f""" Your Custom message is: {custom_message}""") #here is where I need to limit the number of characters per line to 25 break So where I print it here: Your Custom message is: {custom_message}""") #here is where I need to limit the number of characters per line to 25 I need to limit the output to 25 characters per line.
You can do message = "More than 25 characters in this message!" print(f"{message:.25}") Output More than 25 characters i
You might use textwrap.fill to break excessively long string into lines, example usage import textwrap message = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." print(textwrap.fill(message, 25)) output Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
>>> my_str = """This is a really long message that is longer than 25 characters""" #For 25 characters TOTAL >>> print(f"This is your custom message: {my_str}"[:25]) 'This is your custom messa' #For 25 characters in custom message >>> print(f"This is your custom message: {my_str[:25]}") This is your custom message: This is a really long mes This takes advantage of the substring operator. This cuts off any characters past the 25th character.
As we have already checked that the message is not more than 50 characters, we just need to know whether it is more or less than 25 characters long. ln = len(custom_message) -1 # because strings are 0 indexed if ln < 25: print(custom_message) else: print(f"This is your custom message: {my_str}"[:ln]) print(f"This is your custom message: {my_str}"[25:ln])
Struggling with reading a text file and use of nested loops
I have attempted making a program that counts the number of occurrences of "[AB]" in a text file by searching each file individually (after loading and opening the file of course) but it doesn't seem to work, and I have no idea why. Here is the program: # NOTE: to make it work try making more functions that return values and check if # for the beginning and end of the names # to deal with the issue of local variable scope #imports and reads first line of text file print("Opening and closing file") print("\nReading characters from file.") text_file = open("chat3.txt", "r") #prints current line just for checking(can remove later) x = 0 ABcount = 0 d = 0 length = len(text_file.readlines()) print("There are no of lines ", length) line = text_file.readline() print("the current line is ", line) #loop to find most commonly used words( a tuple with word(string): no of occurences(int)) print("point 1(before loop 1)") for d in range(0, length): print("point 2(just into loop 1)") c = text_file.readline()#reads one line and stores it in variable c as a string count = len(c)#gets the length of line/no of characters in it as the next loop will iterate for each one print(c) print("point 3(in loop 1 after printing current line)") for x in range(0, count): print("This is count number", x+1) c2 = c[x] print("Current char is ", c2) if(('[' in c) and (c2 == '[')): start = c.index('[') + 1 end = c.index(':') ABcount += 1 print("There is/are ", ABcount, c[start:end]) elif ( not '[' in c): break text_file.close() And chat3.txt content's are: nn an an [AB:2020] [AB] [AB] And the results from comp + running are PS C:\Users\test> python counter.py Opening and closing file Reading characters from file. 
There are no of lines 3 the current line is point 1(before loop 1) point 2(just into loop 1) point 3(in loop 1 after printing current line) point 2(just into loop 1) point 3(in loop 1 after printing current line) point 2(just into loop 1) point 3(in loop 1 after printing current line) PS C:\Users\test>
Use regex for this kind of thing t.txt Deserunt velit ipsum quis id aliquip commodo deserunt nulla officia ea dolor reprehenderit pariatur. Sit laboris culpa in non et. Do laborum aliqua sunt voluptate occaecat anim magna eu. Est tempor ad non consectetur ea reprehenderit est quis et. Culpa eu sit amet est ullamco eiusmod et sit excepteur et cupidatat ullamco consectetur Lorem. Dolore elit dolore proident consectetur ipsum non. Sunt veniam incididunt duis veniam dolor sunt fugiat irure eiusmod. Nulla eiusmod voluptate aute tempor amet aliquip ad culpa dolor labore consequat ut ea proident. Qui minim velit elit ut excepteur fugiat nisi esse do et sit. Consequat est pariatur officia incididunt et pariatur laborum aute veniam do adipisicing. Eu aliqua ex ex irure. Mollit adipisicing est id quis eiusmod aliqua ullamco cupidatat. Lorem ea esse magna aliqua aute occaecat. Velit in enim ut ad eu magna amet fugiat labore amet ea. Adipisicing duis enim tempor ipsum magna duis. Consectetur ullamco adipisicing est aute fugiat qui excepteur nostrud nisi laboris ipsum. Officia sunt eiusmod consectetur dolor do et adipisicing duis cillum. Adipisicing esse exercitation deserunt labore Lorem deserunt consectetur ad laboris anim sit veniam ex ea. Minim voluptate pariatur dolor adipisicing commodo voluptate consectetur aute id officia irure elit. Cillum eiusmod esse nulla enim nostrud mollit voluptate incididunt ullamco anim cillum officia. script with open('r.txt','r') as file: f=file.read() import re re.findall('ab',f) print(re.findall('ab',f)) # ['ab', 'ab', 'ab', 'ab', 'ab', 'ab', 'ab', 'ab']
To answer your question, it does not enter your loop because when you first call readlines, it set the cursor at the end of the file and so the next readline returns nothing. This might help: Why the second time I run "readlines" on the same file nothing is returned? If you want to loop a file line by line just do for line in file: For the rest, as suggested in other answers there are most certainly better way to do this, but I believe it is not the question here.
RMarkdown: knitr::purl() on Python code chunk?
I want to export my Python code chunk in RMarkdown to an external file. knitr::purl() achieves this, but I am only able to make it work on R code chunks. Does it not work for any other language than R? For example, from below, export the python code into a my_script.py file. --- title: "Untitled" output: html_document --- ## Header Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ```{python} x = 10 y = 20 z = x + y print(z) ```
Currently purl outputs non-R code commented out. So we need to redefine output function to override this. Here is a simple script that (1) outputs python code only, and (2) strips documentation (I took the function from knitr source and hacked it): library("knitr") # New processing functions process_tangle <- function (x) { UseMethod("process_tangle", x) } process_tangle.block <- function (x) { params = opts_chunk$merge(x$params) # Suppress any code but python if (params$engine != 'python') { params$purl <- FALSE } if (isFALSE(params$purl)) return("") label = params$label ev = params$eval code = if (!isFALSE(ev) && !is.null(params$child)) { cmds = lapply(sc_split(params$child), knit_child) one_string(unlist(cmds)) } else knit_code$get(label) if (!isFALSE(ev) && length(code) && any(grepl("read_chunk\\(.+\\)", code))) { eval(parse_only(unlist(stringr::str_extract_all(code, "read_chunk\\(([^)]+)\\)")))) } code = knitr:::parse_chunk(code) if (isFALSE(ev)) code = knitr:::comment_out(code, params$comment, newline = FALSE) # Output only the code, no documentation return(knitr:::one_string(code)) } # Reassign functions assignInNamespace("process_tangle.block", process_tangle.block, ns="knitr") # Purl purl("tmp.Rmd", output="tmp.py") Here is my tmp.Rmd file. Note that it has an R chunk, which I do not want in the result: --- title: "Untitled" output: html_document --- ## Header Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ```{python} #!/usr/bin/env python # A python script ``` ```{python} x = 10 y = 20 z = x + y print(z) ``` ```{r} y=5 y ``` Running Rscript extract.R I get tmp.py: #!/usr/bin/env python # A python script x = 10 y = 20 z = x + y print(z) PS I found this question searching for the solution to the same problem. Since nobody answered it, I developed my own solution :)
Python Grabbing String in between characters
If I have a string like /Hello how are you/, how am I supposed to grab this line and delete it using a python script. import sys import re i_file = sys.argv[1]; def stripwhite(text): lst = text.split('"') for i, item in enumerate(lst): if not i % 2: lst[i] = re.sub("\s+", "", item) return '"'.join(lst) with open(i_file) as i_file_comment_strip: i_files_names = i_file_comment_strip.readlines() for line in i_files_names: with open(line, "w") as i_file_data: i_file_comment = i_file_data.readlines(); for line in i_file_comment: i_file_comment_data = i_file_comment.strip() In the i_file_comment I have the lines from i_file_data and i_file_comment contains the lines with the "/.../" format. Would I use a for loop through each character in the line and replace every one of those characters with a ""?
If you want to remove the /Hello how are you/ you can use regex: import re x = 'some text /Hello how are you/ some more text' print (re.sub(r'/.*/','', x)) Output: some text some more text
If you know you have occurences of a fixed string in your lines, you can simply do for line in i_file_comment: line = line.replace('/Hello how are you/', '') however, if what you have is multiple occurences of strings delimited by / (i.e. /foo/, /bar/), I think using a simple regex will sufice: >>> import re >>> regex = re.compile(r'\/[\w\s]+\/') >>> s = """ ... Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod ... /Hello how are you/ ++ tempor incididunt ut labore et dolore magna aliqua. ... /Hello world/ -- ullamco laboris nisi ut aliquip ex ea commodo ... """ >>> print re.sub(regex, '', s) # find substrings matching the regex, replace them with '' on string s Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod ++ tempor incididunt ut labore et dolore magna aliqua. -- ullamco laboris nisi ut aliquip ex ea commodo >>> just adjust the regex to what you need to get rid of :)
get words from large file, using low memory in python
I need to iterate over the words in a file. The file could be very big (over 1TB), the lines could be very long (maybe just one line). Words are English, so reasonable in size. So I don't want to load in the whole file or even a whole line. I have some code that works, but may explode if lines are to long (longer than ~3GB on my machine). def words(file): for line in file: words=re.split("\W+", line) for w in words: word=w.lower() if word != '': yield word Can you tell be how I can, simply, rewrite this iterator function so that it does not hold more than needed in memory?
Don't read line by line, read in buffered chunks instead: import re def words(file, buffersize=2048): buffer = '' for chunk in iter(lambda: file.read(buffersize), ''): words = re.split("\W+", buffer + chunk) buffer = words.pop() # partial word at end of chunk or empty for word in (w.lower() for w in words if w): yield word if buffer: yield buffer.lower() I'm using the callable-and-sentinel version of the iter() function to handle reading from the file until file.read() returns an empty string; I prefer this form over a while loop. If you are using Python 3.3 or newer, you can use generator delegation here: def words(file, buffersize=2048): buffer = '' for chunk in iter(lambda: file.read(buffersize), ''): words = re.split("\W+", buffer + chunk) buffer = words.pop() # partial word at end of chunk or empty yield from (w.lower() for w in words if w) if buffer: yield buffer.lower() Demo using a small chunk size to demonstrate this all works as expected: >>> demo = StringIO('''\ ... Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque in nulla nec mi laoreet tempus non id nisl. Aliquam dictum justo ut volutpat cursus. Proin dictum nunc eu dictum pulvinar. Vestibulum elementum urna sapien, non commodo felis faucibus id. Curabitur ... ''') >>> for word in words(demo, 32): ... print word ... lorem ipsum dolor sit amet consectetur adipiscing elit pellentesque in nulla nec mi laoreet tempus non id nisl aliquam dictum justo ut volutpat cursus proin dictum nunc eu dictum pulvinar vestibulum elementum urna sapien non commodo felis faucibus id curabitur