I am trying to crawl abstracts from PubMed and filter them using regex in Python. To speed things up, I wanted to use Python's multiprocessing Pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple
def beautify_abstract(abstract: str, regex: str):
    import re

    result: str = ""
    last_start = 0
    matches = re.finditer(regex, abstract, re.MULTILINE)
    for matchNum, match in enumerate(matches, start=1):
        result += abstract[last_start:match.start()]
        result += "<b>"
        result += abstract[match.start():match.end()]
        result += "</b>"
        last_start = match.end()
    result += abstract[last_start:]
    return result
def get_doi(pim: str, regex: str):
    from Bio import Entrez
    from Bio.Entrez import efetch
    import re
    from metapub.convert import pmid2doi

    Entrez.email = "Your.Name.Here#example.org"
    print(f"Processing {pim}")
    abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
    abstract = abstract_handle.read()
    abstract_handle.close()

    if re.search(regex, abstract, re.MULTILINE) is not None:
        docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
        docsum = docsum_handle.read()
        try:
            doi = pmid2doi(pim)
        except:
            doi = "UNKNOWN"
        return f"{doi}"
    return ""
def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
    from Bio import Entrez

    Entrez.email = "Your.Name.Here#example.org"
    searchterm = " ".join(keywords)
    pims = []
    handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
    record = Entrez.read(handle)
    handle.close()
    count = int(record['Count'])
    if count > 100000:
        retmax = 100000
    else:
        retmax = count
    retstart = 0
    while retstart < count:
        handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
        record = Entrez.read(handle)
        handle.close()
        for pim in record['IdList']:
            pims.append(pim)
        retstart += retmax
    return pims
if __name__ == '__main__':
keywords = ["keyword1", "keyword2"]
pim_list = get_pim_with_regex_list(keywords)
regex = "keyword1 keyword2"
worker_fn = partial(get_doi, regex=regex)
pool = mp.Pool(mp.cpu_count())
entries = pool.map(worker_fn, pim_list)
pool.close()
pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with Python and found out that only Python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what the problem may be.
As suggested, I uploaded a minimal (sequential) working example here.
Is there any way to fix this problem or at least diagnose the real issue here?
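One way to test the native-type hypothesis is sketched below. It assumes, without having verified it against this environment, that the IDs returned by Entrez.read with idtype="acc" are str subclasses (such as Biopython's StringElement, whose __new__ takes tag and attributes) rather than plain str, which would explain why unpickling them in the pool workers fails. Coercing every ID to a plain str before handing the list to the pool would then avoid the error:

# hedged sketch: force plain str IDs so the ForkingPickler only ever sees native types
if __name__ == '__main__':
    keywords = ["keyword1", "keyword2"]
    pim_list = [str(pim) for pim in get_pim_with_regex_list(keywords)]
    worker_fn = partial(get_doi, regex="keyword1 keyword2")
    with mp.Pool(mp.cpu_count()) as pool:
        entries = pool.map(worker_fn, pim_list)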
Related
I have an iterator that retrieves a varying number of lines from a very large (>20 GB) file, depending on some features. The iterator works fine, but I can only use one thread to process the result. I would like to feed the value from each iteration to multiple threads/processes.
I'm using a text file with 9 lines to mimic my data; here is my code. I've been struggling with how to create the feedback so that when one process finishes, it goes and retrieves the next iteration:
from multiprocessing import Process, Manager
import time
# Iterator
class read_file(object):
    def __init__(self, filePath):
        self.file = open(filePath, 'r')

    def __iter__(self):
        return self

    def __next__(self):
        line = self.file.readline()
        if line:
            return line
        else:
            raise StopIteration
# worker for one process
def print_worker(a, n, stat):
    print(a)
    stat[n] = True # Set the finished status as True
    return None
# main
def main():
    file_path = 'tst_mp.txt' # the txt file wit 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    workers = []

    # Create shared list for store dereplicated dict and progress counter
    manager = Manager()
    status = manager.list([False] * 2) # list of dictonary for each thread

    # Initiate the workers
    for i in range(n_worker):
        workers.append(Process(target=print_worker, args=(file_handle.__next__(), i, status,)))
    for worker in workers:
        worker.start()

    block = file_handle.__next__() # The next block (line)
    while block: # continue is there is still block left
        print(status)
        time.sleep(1) # for every second
        for i in range(2):
            if status[i]: # Worker i finished
                workers[i].join()
                # workers[i].close()
                workers[i] = Process(target=print_worker, args=(block, i, status,))
                status[i] = False # Set worker i as busy (False)
                workers[i].start() # Start worker i
                try: # try to get the next item in the iterator
                    block = file_handle.__next__()
                except StopIteration:
                    block = False
if __name__ == '__main__':
    main()
The code is clumsy, but it did print out the sequence; however, it also produced some errors when I ran the code twice:
1
2
3
4
5
6
7
8
9
Process Process-10:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/zewei/share/paf_depth/test_multiprocess.py", line 31, in print_worker
stat[n] = True # Set the finished status as True
File "<string>", line 2, in __setitem__
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
self._connect()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 794, in _connect
dispatch(conn, None, 'accept_connection', (name,))
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 90, in dispatch
kind, result = c.recv()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 255, in recv
buf = self._recv_bytes()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv(4)
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 384, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Here is where I'm stuck; I was wondering if there is any fix or a more elegant way to do this?
Thanks!
Here's a better way to do what you are doing, using pool:
from multiprocessing import Pool
import time
.
.
.
.
# worker for one process
def print_worker(a):
    print(a)
    return None

def main():
    file_path = r'' # the txt file with 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    results = []
    with Pool(n_worker) as pool:
        for result in pool.imap(print_worker, file_handle):
            results.append(result)
    print(results)

if __name__ == '__main__':
    main()
Here, the imap function lazily iterates over the iterator, so that the whole file won't be read into memory. Pool handles spreading the tasks across the number of processes you started (using n_worker) automatically so that you don't have to manage it yourself.
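As a side note, and only as a minimal sketch: an open file object is itself an iterator over lines, so the read_file wrapper class can be dropped and the handle passed to imap directly (chunksize=1 keeps the dispatch at one line per task):

def main():
    n_worker = 2
    results = []
    # a file object already yields one line per iteration
    with open('tst_mp.txt', 'r') as file_handle, Pool(n_worker) as pool:
        for result in pool.imap(print_worker, file_handle, chunksize=1):
            results.append(result)
    print(results)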
I'm trying to convert text into a bunch of skipgrams with the help of Keras's Tokenizer. I have code that works for single sentences, but not for batches of sentences. Namely, when I try to run the method at the very bottom, I get an error message:
Traceback (most recent call last):
  File "/Users/montana/Documents/Capstone project/skipgrams.py", line 61, in word2vec_nn_generator
    question_word_targets,question_word_contexts = sequences_to_skipgrams(question_sequences)
  File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 1972, in __call__
    return self._vectorize_call(func=func, args=vargs)
  File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 2042, in _vectorize_call
    ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
  File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 2002, in _get_ufunc_and_otypes
    outputs = func(*inputs)
  File "/Users/montana/Documents/Capstone project/skipgrams.py", line 54, in <lambda>
    sequences_to_skipgrams = np.vectorize(lambda x: sequence_to_skipgrams(x,3,len(textTokenizer.word_index) + 1))
  File "/Users/montana/Documents/Capstone project/skipgrams.py", line 48, in sequence_to_skipgrams
    couples, labels = skipgrams(data, vocab_size, window_size=window_size)
  File "/Users/montana/miniconda3/lib/python3.6/site-packages/keras_preprocessing/sequence.py", line 197, in skipgrams
    for i, wi in enumerate(sequence):
TypeError: 'numpy.int32' object is not iterable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/montana/Documents/Capstone project/skipgrams.py", line 72, in <module>
    for i in word2vec_nn_generator(questionsTokenized_train,contextTokenized_train,trainingData["answer_start"],1):
  File "/Users/montana/Documents/Capstone project/skipgrams.py", line 65, in word2vec_nn_generator
    raise ValueError("Exception in word2vec_nn_generator.")
ValueError: Exception in word2vec_nn_generator.

[Process completed]
What exactly is this error message indicating, and how can I fix it?
import json
import numpy as np
import pandas as pd
import os
assert os.path.isfile("train-v1.1.json"),"Non-existent file"
from tensorflow.python.client import device_lib
import tensorflow.compat.v1 as tf
#import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import re
regex = re.compile(r'\W+')
#Reading the files.
def readFile(filename):
    with open(filename) as file:
        fields = []
        JSON = json.loads(file.read())
        articles = []
        for article in JSON["data"]:
            articleTitle = article["title"]
            article_body = []
            for paragraph in article["paragraphs"]:
                paragraphContext = paragraph["context"]
                article_body.append(paragraphContext)
                for qas in paragraph["qas"]:
                    question = qas["question"]
                    answer = qas["answers"][0]
                    fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
            article_body = "\\n".join(article_body)
            article = {"title":articleTitle,"body":article_body}
            articles.append(article)
        fields = pd.DataFrame(fields)
        fields["question"] = fields["question"].str.replace(regex," ")
        assert not (fields["question"].str.contains("catalanswhat").any())
        fields["paragraph_context"] = fields["paragraph_context"].str.replace(regex," ")
        fields["answer_text"] = fields["answer_text"].str.replace(regex," ")
        assert not (fields["paragraph_context"].str.contains("catalanswhat").any())
        fields["article_title"] = fields["article_title"].str.replace("_"," ")
        assert not (fields["article_title"].str.contains("catalanswhat").any())
        return fields,JSON["data"]
trainingData,training_JSON = readFile("train-v1.1.json")
print("JSON dataset read.")
#Text preprocessing
## Converting text to skipgrams
from tensorflow.keras.preprocessing.text import *
from tensorflow.keras.preprocessing.sequence import skipgrams,make_sampling_table
def sequence_to_skipgrams(data,window_size,vocab_size):
    sampling_table = make_sampling_table(vocab_size)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size)
    assert len(couples) > 0
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    return word_target,word_context
sequences_to_skipgrams = np.vectorize(lambda x: sequence_to_skipgrams(x,3,len(textTokenizer.word_index) + 1))
def word2vec_nn_generator(question_sequences,context_sequences,answer_starts,batch_size):
    while True:
        sequence_indices = np.random.randint(0,high=question_sequences.shape[0],size=10)
        question_sequences = question_sequences[sequence_indices,:]
        context_sequences = context_sequences[sequence_indices,:]
        try:
            question_word_targets,question_word_contexts = sequences_to_skipgrams(question_sequences)
            context_word_targets,context_word_contexts = sequences_to_skipgrams(context_sequences)
            yield question_word_targets,question_word_contexts,context_word_targets,context_word_contexts,answer_starts
        except Exception as e:
            raise ValueError("Exception in word2vec_nn_generator.")
strings = trainingData.drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer()
textTokenizer.fit_on_texts(strings)
questionsTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["question"]))
contextTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["paragraph_context"]))
for i in word2vec_nn_generator(questionsTokenized_train,contextTokenized_train,trainingData["answer_start"],1):
    print(i)
    break
This message indicates that the tf.keras.preprocessing.sequence.skipgrams function is receiving its sequence argument in the wrong format.
Check the contents of the data variable in your sequence_to_skipgrams function and compare it with the required format:
sequence: A word sequence (sentence), encoded as a list
of word indices (integers). If using a `sampling_table`,
word indices are expected to match the rank
of the words in a reference dataset (e.g. 10 would encode
the 10-th most frequently occurring token).
Note that index 0 is expected to be a non-word and will be skipped.
Source
The sequence is coming from the question_sequences variable, so, most likely, something is wrong with the way you slice it in the while loop (the word2vec_nn_generator function).
I found the error:
np.vectorize is not the function I wanted to use. np.vectorize assumes that it's given a function that takes a single value, such as a number.
What I wanted to do instead was np.apply_along_axis, which makes a different assumption: that the given function takes a 1-D array.
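A minimal standalone illustration of the difference (not taken from the original script; row_sum is a hypothetical helper):

import numpy as np

def row_sum(row):
    # expects a 1-D array (one row), not a scalar
    return row.sum()

batch = np.array([[1, 2, 3],
                  [4, 5, 6]])

# np.vectorize would call row_sum once per element (scalars 1, 2, 3, ...),
# which is exactly how skipgrams ended up receiving a single numpy.int32.
# np.apply_along_axis calls the function once per row along the chosen axis:
print(np.apply_along_axis(row_sum, 1, batch))  # -> [ 6 15]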
I want to scrape <p> tags from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to a variable.
I want to append the result of the scraping to data = [].
I made url_common for a base website, since some pages don't start with HTTP etc.
from tqdm import tqdm
import faster_than_requests as requests #20% faster on average in my case than urllib.request
import bs4 as bs
def scrape(link, data):
    for i in tqdm(link):
        if i[:3] !='htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht,'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a function like the one above.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] !='htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht,'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)
from multiprocessing import Pool
p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using the above function:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
task = get()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do it so that it uses Pool and at the same time appends the result of the scraping to the given variable.
EDIT
I changed it a little bit to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] !='htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try: #tries are always halpful with url as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht,'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)
links = ['link', 'other_link', 'another_link']
scrape(links)
#WORKS PROPERLY AND PRINTS EVERYTHING
if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    #DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []

def mapping(x): # <-- note that the mapping accepts a single value
    return x**2

for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So, to solve your problem, remove the outermost for-loop, as iterating is handled by the map function:
def scrape(link):
    if link[:3] !='htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht,'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
I am trying to write a Python model which is capable of doing some processing in a PostgreSQL database using the multiprocessing module and peewee.
In single-core mode the code works; however, when I try to run the code with multiple cores I run into an SSL error.
I would like to post the structure of my model in the hope that somebody can advise how to set up my model in a proper way. Currently, I have chosen to use an object-oriented approach in which I make one connection which is shared in a pool. To clarify what I have done, I will now show the source code I have so far.
I have three files: main.py, models.py and parser.py. The contents are the following:
models.py defines the peewee PostgreSQL table and makes a connection to the Postgres server:
import peewee as pw
from playhouse.pool import PooledPostgresqlExtDatabase
KVK_KEY = "id_number"
NAME_KEY = "name"
N_VOWELS_KEY = "n_vowels"
# initialise the data base
database = PooledPostgresqlExtDatabase(
    "testdb", user="postgres", host="localhost", port=5432, password="xxxx",
    max_connections=8, stale_timeout=300)

class BaseModel(pw.Model):
    class Meta:
        database = database
        only_save_dirty = True

# this class describes the format of the sql data base
class Company(BaseModel):
    id_number = pw.IntegerField(primary_key=True)
    name = pw.CharField(null=True)
    n_vowels = pw.IntegerField(default=-1)
    processor = pw.IntegerField(default=-1)

def connect_database(database_name, reset_database=False):
    """ connect the database """
    database.connect()
    if reset_database:
        database.drop_tables([Company])
    database.create_tables([Company])
parser.py contains the CompanyParser class, which is used as the engine of the code to do all the processing. It generates some artificial data which is stored in the PostgreSQL database, and then the run method is used to do some processing with the data already stored in the database:
import pandas as pd
import numpy as np
import random
import string
import peewee as pw
from models import (Company, database, KVK_KEY, NAME_KEY)
import multiprocessing as mp
MAX_SQL_CHUNK = 1000
np.random.seed(0)
def random_name(size=8, chars=string.ascii_lowercase):
""" Create a random character string of 'size' characters """
return "".join(random.choice(chars) for _ in range(size))
def vowel_count(characters):
"""
Count the number of vowels in the string 'characters' and return as an integer
"""
count = 0
for char in characters:
if char in list("aeiou"):
count += 1
return count
class CompanyParser(mp.Process):
    def __init__(self, number_of_companies=100, i_proc=None,
                 number_of_procs=1,
                 first_id=None, last_id=None):
        if i_proc is not None and number_of_procs > 1:
            mp.Process.__init__(self)

        self.i_proc = i_proc
        self.number_of_procs = number_of_procs
        self.n_companies = number_of_companies
        self.data_df: pd.DataFrame = None

        self.first_id = first_id
        self.last_id = last_id

    def generate_data(self):
        """ Create a dataframe with fake company data and id's """
        id_list = np.random.randint(1000000, 9999999, self.n_companies)
        company_list = np.array([random_name() for _ in range(self.n_companies)])
        self.data_df = pd.DataFrame(data=np.vstack([id_list, company_list]).T,
                                    columns=[KVK_KEY, NAME_KEY])
        self.data_df.sort_values([KVK_KEY], inplace=True)

    def store_to_database(self):
        """
        Store the company data to a sql database
        """
        record_list = list(self.data_df.to_dict(orient="index").values())
        n_batch = int(len(record_list) / MAX_SQL_CHUNK) + 1
        with database.atomic():
            for cnt, batch in enumerate(pw.chunked(record_list, MAX_SQL_CHUNK)):
                print(f"writing {cnt}/{n_batch}")
                Company.insert_many(batch).execute()

    def run(self):
        print("Making query at {}".format(self.i_proc))
        query = (Company.
                 select().
                 where(Company.id_number.between(self.first_id, self.last_id)))
        print("Found {} companies".format(query.count()))

        for cnt, company in enumerate(query):
            print("Processing # {} - {}: company {}/{}".format(self.i_proc, cnt,
                                                               company.id_number,
                                                               company.name))
            number_of_vowels = vowel_count(company.name)
            company.n_vowels = number_of_vowels
            company.processor = self.i_proc
            print(f"storing number of vowels: {number_of_vowels}")
            company.save()
Finally, my main script loads the classes stored in models.py and parser.py and launches the code.
from models import (Company, connect_database)
from parser import CompanyParser
number_of_processors = 2
connect_database(None, reset_database=True)
# init an object of the CompanyParser and use the create database
parser = CompanyParser()
company_ids = Company.select(Company.id_number)
parser.generate_data()
parser.store_to_database()
n_companies = company_ids.count()
n_comp_per_proc = int(n_companies / number_of_processors)
print("Found {} companies: {} per proc".format(n_companies, n_comp_per_proc))
for i_proc in range(number_of_processors):
    i_start = i_proc * n_comp_per_proc
    first_id = company_ids[i_start]
    last_id = company_ids[i_start + n_comp_per_proc - 1]
    print(f"Running proc {i_proc} for id {first_id} until id {last_id}")
    sub_parser = CompanyParser(first_id=first_id, last_id=last_id,
                               i_proc=i_proc,
                               number_of_procs=number_of_processors)
    if number_of_processors > 1:
        sub_parser.start()
    else:
        sub_parser.run()
In case number_of_processors = 1, this script works perfectly fine. It generates artificial data, stores it in the PostgreSQL database, and does some processing on the data (it counts the number of vowels in the name and stores it in the n_vowels column).
However, when I try to run this with 2 cores (number_of_processors = 2), I run into the following error:
/opt/miniconda3/bin/python /home/eelco/PycharmProjects/multiproc_peewee/main.py
writing 0/1
Found 100 companies: 50 per proc
Running proc 0 for id 1020737 until id 5295565
Running proc 1 for id 5302405 until id 9891087
Making query at 0
Found 50 companies
Processing # 0 - 0: company 1020737/wqrbgxiu
storing number of vowels: 2
Making query at 1
Process CompanyParser-1:
Processing # 0 - 1: company 1086107/lkbagrbc
storing number of vowels: 1
Processing # 0 - 2: company 1298367/nsdjsqio
storing number of vowels: 2
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
psycopg2.OperationalError: SSL error: sslv3 alert bad record mac
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/eelco/PycharmProjects/multiproc_peewee/parser.py", line 82, in run
company.save()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 5748, in save
rows = self.update(**field_dict).where(self._pk_expr()).execute()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1696, in execute
return self._execute(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2121, in _execute
cursor = database.execute(self)
File "/opt/miniconda3/lib/python3.7/site-packages/playhouse/postgres_ext.py", line 468, in execute
cursor = self.execute_sql(sql, params, commit=commit)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2721, in execute_sql
self.commit()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2512, in __exit__
reraise(new_type, new_type(*exc_args), traceback)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 186, in reraise
raise value.with_traceback(tb)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
peewee.OperationalError: SSL error: sslv3 alert bad record mac
Process CompanyParser-2:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
psycopg2.OperationalError: SSL error: decryption failed or bad record mac
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/eelco/PycharmProjects/multiproc_peewee/parser.py", line 72, in run
print("Found {} companies".format(query.count()))
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1881, in count
return Select([clone], [fn.COUNT(SQL('1'))]).scalar(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1866, in scalar
row = self.tuples().peek(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1853, in peek
rows = self.execute(database)[:n]
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1696, in execute
return self._execute(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1847, in _execute
cursor = database.execute(self)
File "/opt/miniconda3/lib/python3.7/site-packages/playhouse/postgres_ext.py", line 468, in execute
cursor = self.execute_sql(sql, params, commit=commit)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2721, in execute_sql
self.commit()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2512, in __exit__
reraise(new_type, new_type(*exc_args), traceback)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 186, in reraise
raise value.with_traceback(tb)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
peewee.OperationalError: SSL error: decryption failed or bad record mac
Process finished with exit code 0
Somehow something goes wrong as soon as the second process starts to do something with the database. Does anybody have advice on how to get this code working? I have tried the following already:
Try the PooledPostgresqlDatabase and normal PostgresqlDatabase to connect to the database. This leads to the same error.
Try using sqlite instead of postgres. This works for 2 cores, but only if the two processes are not interfering too much; otherwise I get some locking problems. I was under the impression that postgres would be better for multiprocessing than sqlite (is that true?).
When putting a break after launching the first process (so effectively using only one core), the code works, showing that the start method is called correctly.
Hopefully somebody can advise.
Regards
Eelco
After some searching on the internet today I found the solution for my problem here: github.com/coleifer. As coleifer mentions: you apparently first have to set up all the forks before you start connecting to the database. Based on this idea I have modified my code and it is working now.
For those interested I will post my Python scripts again so you can see how I did it, because there are not that many explicit examples out there, so perhaps it may help others.
First of all, all the database and peewee setup is now moved into initialization functions which are only called inside the constructor of the CompanyParser class.
So models.py looks like
import peewee as pw
from playhouse.pool import PooledPostgresqlExtDatabase, PostgresqlDatabase, PooledPostgresqlDatabase
KVK_KEY = "id_number"
NAME_KEY = "name"
N_VOWELS_KEY = "n_vowels"
def init_database():
    db = PooledPostgresqlDatabase(
        "testdb", user="postgres", host="localhost", port=5432, password="xxxxx",
        max_connections=8, stale_timeout=300)
    return db

def init_models(db, reset_tables=False):

    class BaseModel(pw.Model):
        class Meta:
            database = db

    # this class describes the format of the sql data base
    class Company(BaseModel):
        id_number = pw.IntegerField(primary_key=True)
        name = pw.CharField(null=True)
        n_vowels = pw.IntegerField(default=-1)
        processor = pw.IntegerField(default=-1)

    if db.is_closed():
        db.connect()
    if reset_tables and Company.table_exists():
        db.drop_tables([Company])
    db.create_tables([Company])

    return Company
Then, the worker class CompanyParser is defined in the parser.py script and looks like this
import multiprocessing as mp
import random
import string
import numpy as np
import pandas as pd
import peewee as pw
from models import (KVK_KEY, NAME_KEY, init_database, init_models)
MAX_SQL_CHUNK = 1000
np.random.seed(0)
def random_name(size=32, chars=string.ascii_lowercase):
""" Create a random character string of 'size' characters """
return "".join(random.choice(chars) for _ in range(size))
def vowel_count(characters):
"""
Count the number of vowels in the string 'characters' and return as an integer
"""
count = 0
for char in characters:
if char in list("aeiou"):
count += 1
return count
class CompanyParser(mp.Process):
    def __init__(self, reset_tables=False,
                 number_of_companies=100, i_proc=None,
                 number_of_procs=1, first_id=None, last_id=None):
        if i_proc is not None and number_of_procs > 1:
            mp.Process.__init__(self)

        self.i_proc = i_proc
        self.reset_tables = reset_tables
        self.number_of_procs = number_of_procs
        self.n_companies = number_of_companies
        self.data_df: pd.DataFrame = None

        self.first_id = first_id
        self.last_id = last_id

        # initialise the database and models
        self.database = init_database()
        self.Company = init_models(self.database, reset_tables=self.reset_tables)

    def generate_data(self):
        """ Create a dataframe with fake company data and id's and return the array of id's"""
        id_list = np.random.randint(1000000, 9999999, self.n_companies)
        company_list = np.array([random_name() for _ in range(self.n_companies)])
        self.data_df = pd.DataFrame(data=np.vstack([id_list, company_list]).T,
                                    columns=[KVK_KEY, NAME_KEY])
        self.data_df.drop_duplicates([KVK_KEY], inplace=True)
        self.data_df.sort_values([KVK_KEY], inplace=True)
        return self.data_df[KVK_KEY].values

    def store_to_database(self):
        """
        Store the company data to a sql database
        """
        record_list = list(self.data_df.to_dict(orient="index").values())
        n_batch = int(len(record_list) / MAX_SQL_CHUNK) + 1
        with self.database.atomic():
            for cnt, batch in enumerate(pw.chunked(record_list, MAX_SQL_CHUNK)):
                print(f"writing {cnt}/{n_batch}")
                self.Company.insert_many(batch).execute()

    def run(self):
        query = (self.Company.
                 select().
                 where(self.Company.id_number.between(self.first_id, self.last_id)))

        for cnt, company in enumerate(query):
            print("Processing # {} - {}: company {}/{}".format(self.i_proc, cnt, company.id_number,
                                                               company.name))
            number_of_vowels = vowel_count(company.name)
            company.n_vowels = number_of_vowels
            company.processor = self.i_proc
            try:
                company.save()
            except (pw.OperationalError, pw.InterfaceError) as err:
                print("failed save for {} {}: {}".format(self.i_proc, cnt, err))
            else:
                pass
Finally, the main.py script which launches the processes:
from parser import CompanyParser
import time
def main():
    number_of_processors = 2
    number_of_companies = 10000

    parser = CompanyParser(number_of_companies=number_of_companies, reset_tables=True)
    company_ids = parser.generate_data()
    parser.store_to_database()

    n_companies = company_ids.size
    n_comp_per_proc = int(n_companies / number_of_processors)
    print("Found {} companies: {} per proc".format(n_companies, n_comp_per_proc))
    if not parser.database.is_closed():
        parser.database.close()

    processes = list()
    for i_proc in range(number_of_processors):
        i_start = i_proc * n_comp_per_proc
        first_id = company_ids[i_start]
        last_id = company_ids[i_start + n_comp_per_proc - 1]

        print(f"Running proc {i_proc} for id {first_id} until id {last_id}")
        sub_parser = CompanyParser(first_id=first_id, last_id=last_id, i_proc=i_proc,
                                   number_of_procs=number_of_processors)
        if number_of_processors > 1:
            sub_parser.start()
        else:
            sub_parser.run()

        processes.append(sub_parser)

    # this blocks the script until all processes are done
    for job in processes:
        job.join()

    # make sure all the connections are closed
    for i_proc in range(number_of_processors):
        db = processes[i_proc].database
        if not db.is_closed():
            db.close()

    print("Goodbye!")
if __name__ == "__main__":
    start = time.time()
    main()
    duration = time.time() - start
    print(f"Done in {duration} s")
As you can see, the database connection is done per process inside the class.
This example works and is a full example of multiprocessing + peewee and PostgreSQL. Hopefully this may help others. In case you have any comments or suggestions for improvement please let me know.
I got this error too, but with flask + peewee + rq on Heroku. Below is how I solved it:
If you have a simple app that you use with RQ, I would suggest using SimpleWorker.
RQ suggests using rq.worker.HerokuWorker, but I still received an SSL error with this.
The error appeared in a case where I had created follow-up (chained) tasks, where the execution of one depends on another task's success.
Also, I am using flask-rq2, but this applies to normal usage as well, as shown below:
# app.py
app = Flask(__name__)
app.config['RQ_WORKER_CLASS'] = os.getenv('RQ_WORKER_CLASS', 'rq.worker.Worker')
rq = RQ(app)
I solved it by changing the following in the Heroku config:
set your RQ_WORKER_CLASS to rq.worker.SimpleWorker
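For completeness, a minimal sketch of the same change expressed directly in the app config rather than through the Heroku environment variable (the flask-rq2 setup mirrors the snippet above):

# app.py -- fall back to SimpleWorker instead of the default Worker
app = Flask(__name__)
app.config['RQ_WORKER_CLASS'] = os.getenv('RQ_WORKER_CLASS', 'rq.worker.SimpleWorker')
rq = RQ(app)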
I am re-writing a program from a previous question, and I am having trouble with it. Please see the code:
#!/usr/bin/python
import subprocess,time, timeit
from multiprocessing import Process, Queue
import re, os, pprint, math
from collections import defaultdict
Dict = {}
identifier = ""
hexbits = []
count = defaultdict(int)
def __ReadRX__(RX_info):
lines = iter(RX_info.stdout.readline, "")
try:
start = time.clock()
for line in lines:
if re.match(r"^\d+.*$",line):
splitline = line.split()
del splitline[1:4]
identifier = splitline[1]
count[identifier] += 1
end = time.clock()
timing = round((end - start) * 10000, 100)
dlc = splitline[2]
hexbits = splitline[3:]
Dict[identifier] = [dlc, hexbits, count[identifier],int(timing)]
start = end
except keyboardinterrupt:
pass
procRX = subprocess.Popen('receivetest -f=/dev/pcan32'.split(), stdout=subprocess.PIPE)
if __name__ == '__main__':
    munchCan = Process(target=__ReadRX__, args=(procRX))
    munchCan.start()
    munchCan.join()
    print Dict
When trying to run the code I get the following error:
File "./cancheck2.py", line 36, in <module>
munchCan = Process(target=__ReadRx__, args=(procRX))
File "/usr/lib/python2.7/multiprocessing/process.py", line 104, in __init__
self._args = tuple(args)
TypeError: 'Popen' object is not iterable
This code worked before I separated the subprocess and set up __ReadRX__ as a separate process.
Could anyone explain what is happening, as I don't quite understand?
(procRX) doesn't create a tuple, you have to use (procRX,).
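Applied to the question's code, the only change needed is the trailing comma that turns args into a one-element tuple:

# the trailing comma makes (procRX,) a tuple with one item
munchCan = Process(target=__ReadRX__, args=(procRX,))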