Python array text sorting - python

For some context behind this project, I have a file with over 500 text documents that look like the following example:
Case: 44951651558
Sex: M
Age: 50
COLLECTED: 07/26/2019
REPORTED: 07/29/2019
SURGICAL PATHOLOGY REPORT
DIAGNOSIS
A. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco
laboris nisi ut aliquip ex ea commodo consequat.
This is the structure of the txt files that I have. My task is to take these text files and assign the given values to variables such as case, sex, age, collected, reported, and diagnosis. Here is the code I currently have:
import os

directory = "E:\\Tubular Adenoma\\"
keywords = ["Case: ",
            "Sex: ",
            "Age: ",
            "COLLECTED: ",
            "REPORTED: ",
            "DIAGNOSIS"]

for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        lines = []
        with open(directory + filename, "rt") as file:
            for line in file:
                lines.append(line)
        print(lines)
    else:
        continue
I am attempting to assign the values in the text files to the keywords given in the keywords array. I suspect this approach is a dead end and there is probably an easier way to do this. Anything helps.
The desired output is an array that contains the elements listed in the keywords array. For the example file above, the array would look like the following:
["44951651558", "M", "50", "07/26/2019", "07/29/2019", "A. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod..."]

I made it work with other files and it will now return a list of lists.
import os

directory = "E:\\Tubular Adenoma\\"
# changed keywords list to dictionary to store the total characters to remove
# when slicing each matched line below
keywords = {'Case:': 6, 'Sex:': 5, 'Age:': 5, 'COLLECTED:': 12,
            'REPORTED:': 11}
entities = []

for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        lines = []
        with open(directory + filename, "rt") as file:
            for line in file:
                # added .strip() to remove the trailing \n on each line
                lines.append(line.strip())
        # Added a list to store all the values
        info = []
        for l in lines:
            for k, v in keywords.items():
                if l.find(k) == 0:
                    info.append(l[v:])
        diagnoses = ''
        # Gets index of DIAGNOSIS and adds 2 to skip the empty line
        slice_value = lines.index("DIAGNOSIS") + 2
        # Loops over every line after DIAGNOSIS in the lines list and appends
        # it to a string called diagnoses.
        for l in lines[slice_value:]:
            diagnoses += l
        info.append(diagnoses)
        entities.append(info)

for e in entities:
    print(e)
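If a more declarative approach appeals, here is a minimal regex-based sketch of the same extraction (not from the original answer; the field patterns are taken from the sample report above, and the directory path and DIAGNOSIS handling are assumptions, so adjust as needed):
import os
import re

directory = "E:\\Tubular Adenoma\\"
# one pattern per field, in the same order as the keywords array
field_patterns = [
    r"Case:\s*(.+)",
    r"Sex:\s*(.+)",
    r"Age:\s*(.+)",
    r"COLLECTED:\s*(.+)",
    r"REPORTED:\s*(.+)",
]

entities = []
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(directory, filename), "rt") as file:
        text = file.read()
    info = []
    for pattern in field_patterns:
        match = re.search(pattern, text)
        info.append(match.group(1).strip() if match else "")
    # everything after the DIAGNOSIS header, collapsed into one string
    diagnosis = text.split("DIAGNOSIS", 1)[-1].strip().replace("\n", " ")
    info.append(diagnosis)
    entities.append(info)

for e in entities:
    print(e)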

Related

multi thread memory error issue with infinite while loop

I have an infinite loop waiting for messages; the messages accumulated over, say, 30 seconds are divided into chunks and processed in parallel.
import threading

def process_data(data, i):
    # process data for chunks
    parsed_records = []
    for msg in msgs:
        # just deleting unnecessary keys and few key data manipulated
        parsed_records.append(record_data)
    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        outfile.write(parsed_records)
    return True

while True:
    threads = []
    for n in range(len(num_of_chunks)):
        t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
        threads.append(t)
        t.start()
    # Stop the threads
    for x in threads:
        x.join()
But this leads to a MemoryError after a few iterations.
What should be updated to avoid the memory issue and run smoothly?
I even tried the code below:
import multiprocessing
from faker import Faker

# Create Faker object to generate fake data for Producer
fake = Faker()

def myrandomdata(i, j):
    return fake.random_int(min=1, max=j)

def divide_chunks(l, n):
    small_msgs = []
    for i in range(0, len(l), n):
        small_msgs.append(l[i:i + n])
    return small_msgs

def process_data(data, i):
    # process data for chunks
    parsed_records = []
    for msg in msgs:
        # just deleting unnecessary keys and few key data manipulated
        parsed_records.append(record_data)
    rnd = myrandomdata(1, 2000)
    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        outfile.write(parsed_records)
    return True

if __name__ == "__main__":
    while True:
        # sample data
        msgs = [
            {"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},
            {"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},
            {"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},
            {"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},
            {"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]}
        ]
        # msgs are nothing but bulk data received continuously from the server; I am appending to msgs
        chunk_msgs = divide_chunks(msgs, 3)
        # clearing msgs to append next data after chunking previous data
        msgs.clear()
        with multiprocessing.Pool(len(chunk_msgs)) as pool:
            pool.starmap(process_data, [(chunk_msgs[n], n) for n in range(len(chunk_msgs))])
no luck :(
You should be creating the pool only once to avoid the repetitive creation and destruction of processes.
Assuming your processing is CPU-intensive, you should create a pool whose size is the number of CPU cores you have. Let's call this n_cores. Then you should split your msgs list into n_cores chunks, where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements go into each chunk; it would be more convenient for it to specify the total number of chunks and let the function figure out how many elements need to be in each one.
import multiprocessing

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

def process_data(chunk_number, msg_chunk):
    # process data for chunks
    try:
        for msg in msg_chunk:
            # data processing here according to my requirement
            # it may take 20-25 seconds per chunk, which is why parallel
            # processing is planned
            ...
    except Exception as e:
        print("exception", e)
    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    n_cores = multiprocessing.cpu_count()
    with multiprocessing.Pool(n_cores) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, n_cores)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
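As a side note (my illustration, not part of the original answer): pool.starmap with enumerate(chunks) unpacks each (index, chunk) pair into the two positional arguments of process_data, for example:
from multiprocessing.pool import ThreadPool

def show(i, chunk):
    return f"chunk {i} has {len(chunk)} items"

with ThreadPool(2) as pool:
    # starmap unpacks (0, [1, 2]) into show(0, [1, 2]), and so on
    print(pool.starmap(show, enumerate([[1, 2], [3, 4, 5]])))
    # ['chunk 0 has 2 items', 'chunk 1 has 3 items']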
Update to Use Multithreading
Read all the comments in the code and make sure you understand them!!!
from multiprocessing.pool import ThreadPool
from threading import Lock

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

FILE_NO = 1
lock = Lock()

# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs):  # arguments must be in this order
    global FILE_NO
    # process data for chunks
    parsed_records = []
    for msg in msgs:
        # just deleting unnecessary keys and few key data manipulated
        parsed_records.append(record_data)
    # Get next file number
    # Do not use a random number generator:
    with lock:
        file_no = FILE_NO
        FILE_NO += 1
    name = f"sample_{file_no}.json"
    with open(name, "w") as outfile:
        outfile.write(parsed_records)
    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    # The number of chunks you want msgs split into
    # (this will be the number of files created for each invocation of process_data).
    # For now I will assume a fixed value of 10. If this is not true, then set
    # POOL_SIZE to what you think the maximum number of chunks will be.
    # But note this: depending upon where you are creating your files, writing more than
    # one concurrently could hurt performance. This would be the case if you were, for example,
    # writing to a non-solid-state drive.
    # Or recompute N_CHUNKS on each iteration based on the size
    # of msgs:
    N_CHUNKS = 10
    POOL_SIZE = N_CHUNKS
    with ThreadPool(POOL_SIZE) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, N_CHUNKS)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
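One detail worth flagging (my note, not part of the original answer): parsed_records is a list and file.write() only accepts strings, so if the goal is JSON output the usual route is json.dump, roughly like this:
import json

# inside process_data, instead of outfile.write(parsed_records):
with open(name, "w") as outfile:
    json.dump(parsed_records, outfile)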

Python how to split messages

In Discord chat there is a limit of 2000 characters per message, so is there any way to work around it?
For example, in the code below, when someone types !ping the bot sends an embed message. Is it possible to make it split the message after or before a certain line, so the bot hides the rest and gives an option to view it or click to the next page or something?
@bot.command(pass_context=True)
async def ping(ctx):
    embed = discord.Embed(title="Something Title", description="something anything goes here")
    await bot.say(embed=embed)
You can split your text yourself, or take the easy way suggested by @Prashant Godhani here and use the textwrap.wrap() function:
# easy way
import textwrap
import lorem

def sayLongLine(text, wrap_at=200):
    for line in textwrap.wrap(text, wrap_at):
        # use await bot.say - maybe add a delay if you have max says/second
        print(line)

sayLongLine(lorem.paragraph(), 40)
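Applied to the command from the question, that might look roughly like this (an untested sketch using the old bot.say API shown above; long_text stands in for whatever you want to send):
import textwrap

@bot.command(pass_context=True)
async def ping(ctx):
    long_text = "something much longer than 2000 characters goes here"
    for line in textwrap.wrap(long_text, 2000):
        await bot.say(line)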
If you'd rather replicate the functionality of the textwrap module yourself, you can do so by splitting your text at spaces into words and combining the words until they would overshoot the allowed length. Put the overshooting word into the next sentence, join the current words back together and store the result in a list. Loop until done, add the last parts if needed and return the list:
# slightly more complex self-made wrapper:
import lorem

print("----------------------")

def sayLongLineSplitted(text, wrap_at=200):
    """Splits text at spaces and joins it to strings that are as long as
    possible without overshooting wrap_at.
    Returns a list of strings shorter than wrap_at."""
    splitted = text.split(" ")

    def gimme():
        """Yields sentences of correct length."""
        len_parts = 0
        parts = []
        for p in splitted:
            len_p = len(p)
            if len_parts + len_p < wrap_at:
                parts.append(p)
                len_parts += len_p + 1
            else:
                yield ' '.join(parts).strip()
                parts = [p]
                len_parts = len_p
        if parts:
            yield ' '.join(parts).strip()

    return list(gimme())

for part in sayLongLineSplitted(lorem.paragraph(), 40):
    print(part)
Output of self-made wrapper:
# 234567890123456789012345678901234567890
Ut velit magnam sed sed. Eius modi
quiquia numquam. Quaerat eius tempora
tempora consectetur etincidunt est. Sit
dolor quaerat quaerat amet voluptatem
dolorem dolore. Sit adipisci non
etincidunt est aliquam etincidunt sit.
Quaerat porro sed sit.
Output of textwrap-example:
# 234567890123456789012345678901234567890
Etincidunt aliquam etincidunt velit
numquam. Quisquam porro labore velit.
Modi modi porro quaerat dolor etincidunt
quisquam. Ut ipsum quiquia non quisquam
magnam ut sit. Voluptatem non non
dolorem. Tempora quaerat neque quaerat
dolorem velit magnam ipsum.

Python .replace() function not working correctly

I'm trying to figure out why the .replace function in Python isn't working correctly. I spent the entire day yesterday searching for an answer, but alas have not found one.
I'm trying to open and read a file, copy it into a list, count the number of lines in the list, and remove all the punctuation (i.e. , . ! ? etc.). I can do everything except remove the punctuation (and I must use the .replace function instead of importing a module).
with open('Small_text_file.txt', 'r') as myFile:  # adding lines from file to list
    contents = myFile.readlines()

fileList = []
# punctuation = ['(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'"]
for i in contents:
    fileList.append(i.rstrip())

print('The Statistics are:\n', 'Number of lines:', len(fileList))  # first part of question

for item in fileList:
    fileList = item.replace(',', "")
    fileList = item.replace('.', "")

print(fileList)
The "Small text file" is:
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vivamus condimentum sagittis lacus? laoreet luctus ligula laoreet ut.
Vestibulum ullamcorper accumsan velit vel vehicula?
Proin tempor lacus arcu. Nunc at elit condimentum, semper nisi et, condimentum mi.
In venenatis blandit nibh at sollicitudin. Vestibulum dapibus mauris at orci maximus pellentesque.
Nullam id elementum ipsum. Suspendisse
Running the code returns the following:
The Statistics are:
Number of lines: 6
Nullam id elementum ipsum Suspendisse
So the code DOES remove the comma and period characters but it also removes the preceding 5 lines of the text and only prints the very last line. What am I doing wrong here?
Use enumerate:
for x, item in enumerate(fileList):
    fileList[x] = item.replace(',', "").replace('.', "")
Note: item.replace() returns the replaced string, which you need to store back at the right index of the list. enumerate helps you keep track of the index while iterating through the list.
It should be
for i, item in enumerate(fileList):
    fileList[i] = item.replace(',', "").replace('.', "")
Without enumerate,
for i in range(len(fileList)):
    fileList[i] = fileList[i].replace(',', "").replace('.', "")
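Equivalently (my addition, not from the original answers), a list comprehension rebuilds the list in one step:
# replace punctuation in every line and keep the results in a new list
fileList = [item.replace(',', "").replace('.', "") for item in fileList]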

Python Grabbing String in between characters

If I have a string like /Hello how are you/, how am I supposed to grab this line and delete it using a Python script?
import sys
import re

i_file = sys.argv[1]

def stripwhite(text):
    lst = text.split('"')
    for i, item in enumerate(lst):
        if not i % 2:
            lst[i] = re.sub("\s+", "", item)
    return '"'.join(lst)

with open(i_file) as i_file_comment_strip:
    i_files_names = i_file_comment_strip.readlines()

for line in i_files_names:
    with open(line, "w") as i_file_data:
        i_file_comment = i_file_data.readlines()
        for line in i_file_comment:
            i_file_comment_data = i_file_comment.strip()
In i_file_comment I have the lines from i_file_data, and i_file_comment contains the lines in the "/.../" format. Would I use a for loop through each character in the line and replace every one of those characters with ""?
If you want to remove the /Hello how are you/ you can use regex:
import re

x = 'some text /Hello how are you/ some more text'
print(re.sub(r'/.*/', '', x))
Output:
some text some more text
If you know you have occurrences of a fixed string in your lines, you can simply do
for line in i_file_comment:
    line = line.replace('/Hello how are you/', '')
however, if what you have is multiple occurrences of strings delimited by / (i.e. /foo/, /bar/), I think using a simple regex will suffice:
>>> import re
>>> regex = re.compile(r'\/[\w\s]+\/')
>>> s = """
... Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
... /Hello how are you/ ++ tempor incididunt ut labore et dolore magna aliqua.
... /Hello world/ -- ullamco laboris nisi ut aliquip ex ea commodo
... """
>>> print re.sub(regex, '', s) # find substrings matching the regex, replace them with '' on string s
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
++ tempor incididunt ut labore et dolore magna aliqua.
-- ullamco laboris nisi ut aliquip ex ea commodo
>>>
just adjust the regex to what you need to get rid of :)
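One caveat worth adding (my note, not from the original answer): the earlier /.*/  pattern is greedy, so if a line contains two delimited chunks it will also remove the text between them; a non-greedy /.*?/ avoids that:
import re

s = "keep /foo/ this /bar/ and this"
print(re.sub(r'/.*/', '', s))   # greedy: removes from the first / to the last /
print(re.sub(r'/.*?/', '', s))  # non-greedy: removes each /.../ chunk separately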

How would one limit characters per line when printing a raw_input to a text file?

I'm attempting to write a report-creating script. Simply put, I have the user submit strings via a few raw_input() calls. These strings are assigned to global variables, and when they are finished I need the script to print the strings to a file, limited to 80 characters per line. I've looked at the textwrap module and looked around for anyone else who's asked this, but I've only found people trying to limit the characters printed within the script from a raw_input or from a pre-existing file, never printing out to a new file. Here's some code that is basically a shorter version of what I'm trying to do.
Here's the code:
def start():
    global file_name
    file_name = raw_input("\nPlease Specify a filename:\n>>> ")
    print "The filename chosen is: %r" % file_name
    create_report()
    note_type()

def create_report():
    global new_report
    new_report = open(file_name, 'w')
    print "Report created as: %r" % file_name
    new_report.write("Rehearsal Report\n")
    note_type()

def note_type():
    print "\nPlease select which type of note you wish to make."
    print """
1. Type1
2. Print
"""
    answer = raw_input("\n>>> ")
    if answer in "1 1. type1 Type1 TYPE1":
        type1_note()
    elif answer in "2 2. print Print PRINT":
        print_notes()
    else:
        print "Unacceptable Response"
        note_type()

def type1_note():
    print "Please Enter your note:"
    global t1note_text
    t1note_text = raw_input(">>> ")
    print "\nNote Added."
    note_type()

def print_notes():
    new_report.write("\nType 1: %r" % t1note_text)
    new_report.close
    print "Printed. Goodbye!"
    exit(0)

start()
And Here is my terminal input
---
new-host-4:ism Bean$ python SO_Question.py
Please Specify a filename:
">>> " test3.txt
The filename chosen is: 'test3.txt'
Report created as: 'test3.txt'
Please select which type of note you wish to make.
1. Type1
2. Print
">>> " 1
Please Enter your note:
">>> "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam at dignissim diam. Donec aliquam consectetur pretium. Sed ac sem eu nulla tincidunt accumsan. Praesent vel velit odio. Donec porta mauris ut eros bibendum consequat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Integer adipiscing nibh in turpis placerat non interdum magna convallis. Phasellus porta mauris at nibh laoreet ac vulputate elit semper.
Note Added.
Please select which type of note you wish to make.
1. Type1
2. Print
">>> "2
Printed. Goodbye!
new-host-4:ism Bean$
The only problem is that when I open the file (test3.txt), the entire paragraph of lorem ipsum is printed on one line, like this:
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam at dignissim diam. Donec aliquam consectetur pretium. Sed ac sem eu nulla tincidunt accumsan. Praesent vel velit odio. Donec porta mauris ut eros bibendum consequat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Integer adipiscing nibh in turpis placerat non interdum magna convallis. Phasellus porta mauris at nibh laoreet ac vulputate elit semper.
Anybody got any advice on getting textwrap to print 80 characters per line to the file?
If you don't want to use any additional modules, you could split the value of your user input into 80 character chunks yourself:
def split_input(string, chunk_size):
    num_chunks = len(string) // chunk_size
    if len(string) % chunk_size != 0:
        num_chunks += 1
    output = []
    for i in range(0, num_chunks):
        output.append(string[chunk_size * i:chunk_size * (i + 1)])
    return output
Then you could print the output list to a file:
input_chunks = split_input(user_input, 80)
for chunk in input_chunks:
    outFile.write(chunk + "\n")
UPDATE:
This version will respect space-separated words:
def split_input(user_string, chunk_size):
    output = []
    words = user_string.split(" ")
    total_length = 0
    while total_length < len(user_string) and len(words) > 0:
        line = []
        next_word = words[0]
        line_len = len(next_word) + 1
        while (line_len < chunk_size) and len(words) > 0:
            words.pop(0)
            line.append(next_word)
            if len(words) > 0:
                next_word = words[0]
                line_len += len(next_word) + 1
        line = " ".join(line)
        output.append(line)
        total_length += len(line)
    return output
In Python 3, you can use textwrap.fill to print 80-character lines:
import textwrap

print(textwrap.fill(your_text, width=80))
see https://docs.python.org/3.6/library/textwrap.html
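Applied to the question's print_notes, that would look roughly like this (a sketch assuming the textwrap import above plus the new_report file object and t1note_text variable from the original code):
# wrap the note to 80 characters per line before writing it out
new_report.write(textwrap.fill(t1note_text, width=80) + "\n")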
You can try and use the Textwrap module:
from textwrap import TextWrapper

def print_notes(t1note_text):
    wrapper = TextWrapper(width=80)
    splittext = "\n".join(wrapper.wrap(t1note_text))
    new_report.write("\nType 1: %r" % splittext)
    new_report.close()
    print "Printed. Goodbye!"
    exit(0)
