multi thread memory error issue with infinite while loop - python

I am having infinite loop waiting for messages, once i receive messages for example 30 seconds messages accumulated and dividing into chunks and processing it in parallel.
def process_data(data, i):
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
name=f"sample_{rnd}.json"
with open(name, "w") as outfile:
outfile.write(parsed_records)
return True
while true:
threads = []
for n in range(len(num_of_chunks)):
t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
threads.append(t)
t.start()
# Stop the threads
for x in threads:
t.join()
But, leading into MemoryError after few iterations.
Anything to be updated to avoid memory issue and work smoothly
Even i tried below code,
import multiprocessing
from faker import Faker
# Create Faker object to generate fake data for Producer
fake = Faker()
def myrandomdata(i,j):
return fake.random_int(min = 1, max = j)
def divide_chunks(l, n):
small_msgs = []
for i in range(0, len(l), n):
small_msgs.append(l[i:i + n])
return small_msgs
def process_data(data, i):
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
rnd = myrandomdata(1, 2000)
name=f"sample_{rnd}.json"
with open(name, "w") as outfile:
outfile.write(parsed_records)
return True
if __name__ == "__main__":
while true:
#sample data
msgs = [{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]}]
#msgs are nothing but bulk data recieving from server continuously am appending to msgs
chunk_msgs = divide_chunks(msgs, 3)
#clearing msgs to append next data after chunking previous data
msgs.clear()
with multiprocessing.Pool(len(chunk_msgs)) as pool:
pool.starmap(process_data, [(chunk_msgs[n],n) for n in range(len(chunk_msgs))])
no luck :(

You should be creating the pool only once to avoid the repetitive creation and destruction of processes.
Assuming your processing is CPU-intensive, you should be creating a pool whose size is the number of CPU cores you have. Let's call this n_cores. Then you should split your msgs list into n_cores chunks where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements are in each chunk but it would be more convenient for it to specify the total number of chunks and let it figure out how many elements needs to be in each chunk.
import multiprocessing
def divide_chunks(iterable, n):
if type(iterable) is range and iterable.step != 1:
# algorithm doesn't work with steps other than 1:
iterable = list(iterable)
l = len(iterable)
n = min(l, n)
k, m = divmod(l, n)
return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
def process_data(chunk_number, msg_chunk):
#process data for chunks
try:
for msg in msg_chunk:
# data processing here according to my requirement
# it may take 20-25 seconds of process that is why am planning for parallel
# processing
...
except Exception as e:
print("exception", e)
return True
if __name__ == "__main__": # only imports and function/class defs before this line.
n_cores = multiprocessing.cpu_count()
with multiprocessing.Pool(n_cores) as pool:
while True:
# Process next list of messages:
msgs = [...]
chunks = divide_chunks(msgs, n_cores)
msgs.clear()
results = pool.starmap(process_data, enumerate(chunks))
Update to Use Multithreading
Read all the comments in the code and make sure you understand them!!!
from multiprocessing.pool import ThreadPool
from threading import Lock
def divide_chunks(iterable, n):
if type(iterable) is range and iterable.step != 1:
# algorithm doesn't work with steps other than 1:
iterable = list(iterable)
l = len(iterable)
n = min(l, n)
k, m = divmod(l, n)
return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
FILE_NO = 1
lock = Lock()
# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs): # arguments must be in this order
global FILE_NO
#process data for chunks
parsed_records = []
for msg in msgs:
#just deleting unnecessary keys and few key data manipulated
parsed_records.append(record_data)
# Get next file number
# Do not use random number generator:
with lock:
file_no = FILE_NO
FILE_NO += 1
name = f"sample_{file_no}.json"
with open(name, "w") as outfile:
outfile.write(parsed_records)
return True
if __name__ == "__main__": # only imports and function/class defs before this line.
# The number of chunks you want msgs split into
# (this will be the number of files created for each invocation of process_data)
# For now I wll assume a fixed value of 10. If this is not true, then set
# POOL_SIZE to be what you think the maximum number of chunks you will have.
# But note this: depending upon where you are creating your files, writing more than
# one concurrently could hurt performance. This would be the case if you were, for example,
# writing to a non-solid state drive.
# Or recompute N_CHUNKS on each iteration based on size
# of msgs:
N_CHUNKS = 10
POOL_SIZE = N_CHUNKS
with ThreadPool(POOL_SIZE) as pool:
while True:
# Process next list of messages:
msgs = [...]
chunks = divide_chunks(msgs, N_CHUNKS)
msgs.clear()
results = pool.starmap(process_data, enumerate(chunks))

Related

How would I limit the number of characters per line in python from an input

Would there be a way to limit the amount of characters that are printed per line?
while 1:
user_message = ""
messageQ = input("""\nDo you want to enter a message?
[1] Yes
[2] No
[>] Select an option: """)
if messageQ == "1":
message = True
elif messageQ == "2":
message = False
else:
continue
if message == True:
print(
"""
-----------------------------------------------------------------
You can enter a custom message that is below 50 characters.
""")
custom_message = input("""\nPlease enter your custom message:\n \n> """)
if len(custom_message) > 50:
print("[!] Only 50 characters allowed")
continue
else:
print(f"""
Your Custom message is:
{custom_message}""") #here is where I need to limit the number of characters per line to 25
break
So where I print it here:
Your Custom message is:
{custom_message}""") #here is where I need to limit the number of characters per line to 25
I need to limit the output to 25 characters per line.
You can do
message = "More than 25 characters in this message!"
print(f"{message:.25}")
Output
More than 25 characters i
You might use textwrap.fill to break excessively long string into lines, example usage
import textwrap
message = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
print(textwrap.fill(message, 25))
output
Lorem ipsum dolor sit
amet, consectetur
adipiscing elit, sed do
eiusmod tempor incididunt
ut labore et dolore magna
aliqua. Ut enim ad minim
veniam, quis nostrud
exercitation ullamco
laboris nisi ut aliquip
ex ea commodo consequat.
Duis aute irure dolor in
reprehenderit in
voluptate velit esse
cillum dolore eu fugiat
nulla pariatur. Excepteur
sint occaecat cupidatat
non proident, sunt in
culpa qui officia
deserunt mollit anim id
est laborum.
>>> my_str = """This is a really long message that is longer than 25 characters"""
#For 25 characters TOTAL
>>> print(f"This is your custom message: {my_str}"[:25])
'This is your custom messa'
#For 25 characters in custom message
>>> print(f"This is your custom message: {my_str[:25]}")
This is your custom message: This is a really long mes
This takes advantage of the substring operator. This cuts off any characters past the 25th character.
As have already checked that the message is not more than 50 characters we just need to know whether it is more or less than 25 characters long.
ln = len(custom_message) -1 # because strings are 0 indexed
if ln < 25:
print(custom_message)
else:
print(f"This is your custom message: {my_str}"[:ln])
print(f"This is your custom message: {my_str}"[25:ln])
``

Struggling with reading a text file and use of nested loops

I have attempted making a program that counts the number of occurrences of "[AB]" in a text file by searching each file individually (after loading and opening the file of course) but it doesn't seem to work, and I have no idea why.
Here is the program:
# NOTE: to make it work try making more functions that return values and check if
# for the beginning and end of the names
# to deal with the issue of local variable scope
#imports and reads first line of text file
print("Opening and closing file")
print("\nReading characters from file.")
text_file = open("chat3.txt", "r")
#prints current line just for checking(can remove later)
x = 0
ABcount = 0
d = 0
length = len(text_file.readlines())
print("There are no of lines ", length)
line = text_file.readline()
print("the current line is ", line)
#loop to find most commonly used words( a tuple with word(string): no of occurences(int))
print("point 1(before loop 1)")
for d in range(0, length):
print("point 2(just into loop 1)")
c = text_file.readline()#reads one line and stores it in variable c as a string
count = len(c)#gets the length of line/no of characters in it as the next loop will iterate for each one
print(c)
print("point 3(in loop 1 after printing current line)")
for x in range(0, count):
print("This is count number", x+1)
c2 = c[x]
print("Current char is ", c2)
if(('[' in c) and (c2 == '[')):
start = c.index('[') + 1
end = c.index(':')
ABcount += 1
print("There is/are ", ABcount, c[start:end])
elif ( not '[' in c):
break
text_file.close()
And chat3.txt content's are:
nn an an [AB:2020]
[AB]
[AB]
And the results from comp + running are
PS C:\Users\test> python counter.py
Opening and closing file
Reading characters from file.
There are no of lines 3
the current line is
point 1(before loop 1)
point 2(just into loop 1)
point 3(in loop 1 after printing current line)
point 2(just into loop 1)
point 3(in loop 1 after printing current line)
point 2(just into loop 1)
point 3(in loop 1 after printing current line)
PS C:\Users\test>
Use regex for this kind of thing
t.txt
Deserunt velit ipsum quis id aliquip commodo deserunt nulla officia ea dolor reprehenderit pariatur. Sit laboris culpa in non et. Do laborum aliqua sunt voluptate occaecat anim magna eu. Est tempor ad non consectetur ea reprehenderit est quis et. Culpa eu sit amet est ullamco eiusmod et sit excepteur et cupidatat ullamco consectetur Lorem. Dolore elit dolore proident consectetur ipsum non. Sunt veniam incididunt duis veniam dolor sunt fugiat irure eiusmod.
Nulla eiusmod voluptate aute tempor amet aliquip ad culpa dolor labore consequat ut ea proident. Qui minim velit elit ut excepteur fugiat nisi esse do et sit. Consequat est pariatur officia incididunt et pariatur laborum aute veniam do adipisicing.
Eu aliqua ex ex irure. Mollit adipisicing est id quis eiusmod aliqua ullamco cupidatat. Lorem ea esse magna aliqua aute occaecat. Velit in enim ut ad eu magna amet fugiat labore amet ea.
Adipisicing duis enim tempor ipsum magna duis. Consectetur ullamco adipisicing est aute fugiat qui excepteur nostrud nisi laboris ipsum. Officia sunt eiusmod consectetur dolor do et adipisicing duis cillum. Adipisicing esse exercitation deserunt labore Lorem deserunt consectetur ad laboris anim sit veniam ex ea. Minim voluptate pariatur dolor adipisicing commodo voluptate consectetur aute id officia irure elit. Cillum eiusmod esse nulla enim nostrud mollit voluptate incididunt ullamco anim cillum officia.
script
with open('r.txt','r') as file:
f=file.read()
import re
re.findall('ab',f)
print(re.findall('ab',f))
# ['ab', 'ab', 'ab', 'ab', 'ab', 'ab', 'ab', 'ab']
To answer your question, it does not enter your loop because when you first call readlines, it set the cursor at the end of the file and so the next readline returns nothing. This might help: Why the second time I run "readlines" on the same file nothing is returned?
If you want to loop a file line by line just do for line in file:
For the rest, as suggested in other answers there are most certainly better way to do this, but I believe it is not the question here.

RMarkdown: knitr::purl() on Python code chunk?

I want to export my Python code chunk in RMarkdown to an external file. knitr::purl() achieves this, but I am only able to make it work on R code chunks. Does it not work for any other language than R?
For example, from below, export the python code into a my_script.py file.
---
title: "Untitled"
output: html_document
---
## Header
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip
```{python}
x = 10
y = 20
z = x + y
print(z)
```
Currently purl outputs non-R code commented out. So we need to redefine output function to override this.
Here is a simple script that (1) outputs python code only, and (2) strips documentation (I took the function from knitr source and hacked it):
library("knitr")
# New processing functions
process_tangle <- function (x) {
UseMethod("process_tangle", x)
}
process_tangle.block <- function (x) {
params = opts_chunk$merge(x$params)
# Suppress any code but python
if (params$engine != 'python') {
params$purl <- FALSE
}
if (isFALSE(params$purl))
return("")
label = params$label
ev = params$eval
code = if (!isFALSE(ev) && !is.null(params$child)) {
cmds = lapply(sc_split(params$child), knit_child)
one_string(unlist(cmds))
}
else knit_code$get(label)
if (!isFALSE(ev) && length(code) && any(grepl("read_chunk\\(.+\\)",
code))) {
eval(parse_only(unlist(stringr::str_extract_all(code,
"read_chunk\\(([^)]+)\\)"))))
}
code = knitr:::parse_chunk(code)
if (isFALSE(ev))
code = knitr:::comment_out(code, params$comment, newline = FALSE)
# Output only the code, no documentation
return(knitr:::one_string(code))
}
# Reassign functions
assignInNamespace("process_tangle.block",
process_tangle.block,
ns="knitr")
# Purl
purl("tmp.Rmd", output="tmp.py")
Here is my tmp.Rmd file. Note that it has an R chunk, which I do not want in the result:
---
title: "Untitled"
output: html_document
---
## Header
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip
```{python}
#!/usr/bin/env python
# A python script
```
```{python}
x = 10
y = 20
z = x + y
print(z)
```
```{r}
y=5
y
```
Running Rscript extract.R I get tmp.py:
#!/usr/bin/env python
# A python script
x = 10
y = 20
z = x + y
print(z)
PS I found this question searching for the solution to the same problem. Since nobody answered it, I developed my own solution :)

Python Grabbing String in between characters

If I have a string like /Hello how are you/, how am I supposed to grab this line and delete it using a python script.
import sys
import re
i_file = sys.argv[1];
def stripwhite(text):
lst = text.split('"')
for i, item in enumerate(lst):
if not i % 2:
lst[i] = re.sub("\s+", "", item)
return '"'.join(lst)
with open(i_file) as i_file_comment_strip:
i_files_names = i_file_comment_strip.readlines()
for line in i_files_names:
with open(line, "w") as i_file_data:
i_file_comment = i_file_data.readlines();
for line in i_file_comment:
i_file_comment_data = i_file_comment.strip()
In the i_file_comment I have the lines from i_file_data and i_file_comment contains the lines with the "/.../" format. Would I use a for loop through each character in the line and replace every one of those characters with a ""?
If you want to remove the /Hello how are you/ you can use regex:
import re
x = 'some text /Hello how are you/ some more text'
print (re.sub(r'/.*/','', x))
Output:
some text some more text
If you know you have occurences of a fixed string in your lines, you can simply do
for line in i_file_comment:
line = line.replace('/Hello how are you/', '')
however, if what you have is multiple occurences of strings delimited by / (i.e. /foo/, /bar/), I think using a simple regex will sufice:
>>> import re
>>> regex = re.compile(r'\/[\w\s]+\/')
>>> s = """
... Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
... /Hello how are you/ ++ tempor incididunt ut labore et dolore magna aliqua.
... /Hello world/ -- ullamco laboris nisi ut aliquip ex ea commodo
... """
>>> print re.sub(regex, '', s) # find substrings matching the regex, replace them with '' on string s
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
++ tempor incididunt ut labore et dolore magna aliqua.
-- ullamco laboris nisi ut aliquip ex ea commodo
>>>
just adjust the regex to what you need to get rid of :)

get words from large file, using low memory in python

I need to iterate over the words in a file. The file could be very big (over 1TB), the lines could be very long (maybe just one line). Words are English, so reasonable in size. So I don't want to load in the whole file or even a whole line.
I have some code that works, but may explode if lines are to long (longer than ~3GB on my machine).
def words(file):
for line in file:
words=re.split("\W+", line)
for w in words:
word=w.lower()
if word != '': yield word
Can you tell be how I can, simply, rewrite this iterator function so that it does not hold more than needed in memory?
Don't read line by line, read in buffered chunks instead:
import re
def words(file, buffersize=2048):
buffer = ''
for chunk in iter(lambda: file.read(buffersize), ''):
words = re.split("\W+", buffer + chunk)
buffer = words.pop() # partial word at end of chunk or empty
for word in (w.lower() for w in words if w):
yield word
if buffer:
yield buffer.lower()
I'm using the callable-and-sentinel version of the iter() function to handle reading from the file until file.read() returns an empty string; I prefer this form over a while loop.
If you are using Python 3.3 or newer, you can use generator delegation here:
def words(file, buffersize=2048):
buffer = ''
for chunk in iter(lambda: file.read(buffersize), ''):
words = re.split("\W+", buffer + chunk)
buffer = words.pop() # partial word at end of chunk or empty
yield from (w.lower() for w in words if w)
if buffer:
yield buffer.lower()
Demo using a small chunk size to demonstrate this all works as expected:
>>> demo = StringIO('''\
... Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque in nulla nec mi laoreet tempus non id nisl. Aliquam dictum justo ut volutpat cursus. Proin dictum nunc eu dictum pulvinar. Vestibulum elementum urna sapien, non commodo felis faucibus id. Curabitur
... ''')
>>> for word in words(demo, 32):
... print word
...
lorem
ipsum
dolor
sit
amet
consectetur
adipiscing
elit
pellentesque
in
nulla
nec
mi
laoreet
tempus
non
id
nisl
aliquam
dictum
justo
ut
volutpat
cursus
proin
dictum
nunc
eu
dictum
pulvinar
vestibulum
elementum
urna
sapien
non
commodo
felis
faucibus
id
curabitur

Categories

Resources