python2.7 + multiprocessing + selenium: restart process on exception - python

I seem to be having issues with a python script that uses multiprocessing. What it essentially does is take a list of ID codes, and start processes that use Selenium and PhantomJS as the driver to navigate to a URL containing that ID code, extract data to individual csv files, then compile another csv file once all processes finish. Everything runs great, except sometimes one of the processes will return an exception that says:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "modtest.py", line 11, in worker
do_work(item)
File "/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/items.py", line 14, in do_work
driver = webdriver.PhantomJS()
File "/usr/lib/python2.7/site-packages/selenium/webdriver/phantomjs/webdriver.py", line 50, in __init__
self.service.start()
File "/usr/lib/python2.7/site-packages/selenium/webdriver/phantomjs/service.py", line 72, in start
raise WebDriverException("Can not connect to GhostDriver")
I've tried working in ways to restart a process if an exception is raised, but regardless of what I try, once the processes finish the program hangs and doesn't move on, or do anything for that matter. Essentially, if a process crashes I would like to retry the ID code it was searching for, and continue on once all processes are finished. Here is an extremely slimmed-down version of the code:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup as bs
import multiprocessing
import datetime, time, csv, glob

num_procs = 8

def do_work(rsrt):
    driver = webdriver.PhantomJS()
    try:
        driver.get('http://www.example.com/get.php?resort=' + rsrt)
        soup = bs(driver.page_source, 'html.parser')  # missing from the slimmed-down code; 'soup' is used below
        rows = []
        for row in soup.find_all('tr'):
            if row.find('input', {'name': 'booksubmit'}):
                wyncheckin = row.find('td', {'class': 'searchAvailDate'}).string
                wynnights = row.find('td', {'class': 'searchAvailNights'}).string
                wynroom = row.find('td', {'class': 'searchAvailUnitType'}).string
                rows.append([rsrt, wyncheckin, wynroom])  # 'wynresort' was undefined; presumably the resort ID
        driver.quit()
        with open('/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/availability/' + rsrt + '.csv', 'wb') as f:
            writer = csv.writer(f)
            writer.writerows(row for row in rows if row)
        print 'Process ' + rsrt + ' End: ' + str(time.strftime('%c'))
    except:
        driver.quit()

def worker():
    for item in iter(q.get, None):
        do_work(item)
        q.task_done()
    q.task_done()
q = multiprocessing.JoinableQueue()
procs = []
for i in range(num_procs):
    procs.append(multiprocessing.Process(target=worker))
    procs[-1].daemon = True
    procs[-1].start()

source = ['0017', '0113', '0020', '0013', '0038', '1028', '0115', '0105', '0041', '0037', '0043', '2026', '0165', '0164',
          '0033', '0126', '0116', '0103', '9135', '0185', '0206', '0053', '0062', '1020', '0019', '0042', '2028', '0213',
          '0211', '0163', '0073', '2020', '0214', '2140', '0084', '0193', '0095', '0064', '0196', '0028', '0068', '0074']

for item in source:
    q.put(item)
q.join()

for p in procs:
    q.put(None)
q.join()

for p in procs:
    p.join()

print "Finished"
print 'Writting core output: ' + str(time.strftime('%c'))
with open('availability.csv', 'wb') as outfile:
    for csvfile in glob.glob('/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/availability/*.csv'):
        for line in open(csvfile, 'r'):
            outfile.write(line)
print 'Process End: ' + str(time.strftime('%c'))

One way to tackle this sort of problem is to have the function call itself recursively, along the lines of:

def do_work(rsrt):
    ...
    if failed:
        return do_work(rsrt)

Of course this will keep retrying until it succeeds, so you might want to pass a counter and simply return False once it exceeds a certain value.
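As a minimal sketch of that bounded-retry idea (assuming the WebDriverException from the traceback is the failure to retry on; the retries parameter and the return values are illustrative, not part of the original code):

from selenium.common.exceptions import WebDriverException

def do_work(rsrt, retries=3):
    try:
        driver = webdriver.PhantomJS()
    except WebDriverException:                 # "Can not connect to GhostDriver"
        if retries > 0:
            return do_work(rsrt, retries - 1)  # retry the same ID code
        return False                           # give up after a few attempts
    try:
        # ... scrape and write the CSV exactly as before ...
        return True
    finally:
        driver.quit()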

Related

Why does my second python async (scraping) function (which uses results from the first async (scraping) function) return no result?

Summary of what the program should do:
Step 1 (sync): Determine exactly how many pages need to be scraped.
Step 2 (sync): Create the links to the pages to be scraped in a for-loop.
Step 3 (async): Use the link list from step 2 to get the links to the desired detail pages from each of these pages.
Step 4 (async): Use the result from step 3 to extract the detail information for each Hofladen (farm shop). This information is stored in a list for each farm shop, and each of these lists is appended to a global list.
Where do I have the problem?
The transition from step 3 to step 4 does not seem to work properly.
Traceback (most recent call last):
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 108, in <module>
asyncio.run(main())
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 96, in main
await asyncio.gather(*tasks_detail_infos)
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 61, in scrape_detail_infos
data = JsonLdExtractor().extract(body_d)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/jsonld.py", line 21, in extract
tree = parse_html(htmlstring, encoding=encoding)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/utils.py", line 10, in parse_html
return lxml.html.fromstring(html, parser=parser)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 873, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 761, in document_fromstring
raise etree.ParserError(
lxml.etree.ParserError: Document is empty
Process finished with exit code 1
What did I do to isolate the problem?
In a first attempt I rewrote the async function append_detail_infos so that it no longer tries to create a list and append the values but only prints data[0]["name"].
This resulted in the error message
Traceback (most recent call last):
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 108, in <module>
asyncio.run(main())
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 96, in main
await asyncio.gather(*tasks_detail_infos)
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 61, in scrape_detail_infos
data = JsonLdExtractor().extract(body_d)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/jsonld.py", line 21, in extract
tree = parse_html(htmlstring, encoding=encoding)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/utils.py", line 10, in parse_html
return lxml.html.fromstring(html, parser=parser)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 873, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 761, in document_fromstring
raise etree.ParserError(
lxml.etree.ParserError: Document is empty
Process finished with exit code 1
In the next attempt, I exported the links from detail_links as a .csv, visually checked them, and opened some of them to see if they were valid. They were.
The program code:
import asyncio
import time

import aiohttp
import requests
import re
from selectolax.parser import HTMLParser
from extruct.jsonld import JsonLdExtractor
import pandas as pd

BASE_URL = "https://hofladen.info"
FIRST_PAGE = 1

def get_last_page(url: str) -> int:
    res = requests.get(url).text
    html = HTMLParser(res)
    last_page = int(re.findall("(\d+)", html.css("li.page-last > a")[0].attributes["href"])[0])
    return last_page

def build_links_to_pages(start: int, ende: int) -> list:
    lst = []
    for i in range(start, ende + 1):
        url = f"https://hofladen.info/regionale-produkte?page={i}"
        lst.append(url)
    return lst

async def scrape_detail_links(url: str):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, allow_redirects=True) as resp:
            body = await resp.text()
            html = HTMLParser(body)
            for node in html.css(".sp13"):
                detail_link = BASE_URL + node.attributes["href"]
                detail_links.append(detail_link)

async def append_detail_infos(data):
    my_detail_lst = []
    # print(data[0]["name"])  # name, for debugging purposes
    my_detail_lst.append(data[0]["name"])  # name
    my_detail_lst.append(data[0]["address"]["streetAddress"])  # street
    my_detail_lst.append(data[0]["address"]["postalCode"])  # postal code
    my_detail_lst.append(data[0]["address"]["addressLocality"])  # city
    my_detail_lst.append(data[0]["address"]["addressRegion"])  # state
    my_detail_lst.append(data[0]["address"]["addressCountry"])  # country
    my_detail_lst.append(data[0]["geo"]["latitude"])  # latitude
    my_detail_lst.append(data[0]["geo"]["longitude"])  # longitude
    detail_infos.append(my_detail_lst)

async def scrape_detail_infos(detail_link: str):
    async with aiohttp.ClientSession() as session_detailinfos:
        async with session_detailinfos.get(detail_link) as res_d:
            body_d = await res_d.text()
            data = JsonLdExtractor().extract(body_d)
            await append_detail_infos(data)

async def main() -> None:
    start_time = time.perf_counter()

    # Begin individual code
    # ----------
    global detail_links, detail_infos
    detail_links, detail_infos = [], []
    tasks = []
    tasks_detail_infos = []

    # extract the last page to iterate over
    last_page = get_last_page("https://hofladen.info/regionale-produkte")

    # scrape detail links
    links_to_pages = build_links_to_pages(FIRST_PAGE, last_page)
    for link in links_to_pages:
        task = asyncio.create_task(scrape_detail_links(link))
        tasks.append(task)
    print("Saving the output of extracted information.")
    await asyncio.gather(*tasks)
    pd.DataFrame(data=detail_links).to_csv("detail_links.csv")

    # scrape detail infos
    for detail_url in detail_links:
        task_detail_infos = asyncio.create_task(scrape_detail_infos(detail_url))
        tasks_detail_infos.append(task_detail_infos)
    await asyncio.gather(*tasks_detail_infos)

    # End individual code
    # ------------
    time_difference = time.perf_counter() - start_time
    print(f"Scraping time: {time_difference} seconds.")
    print(len(detail_links))
    # print(detail_infos[])

asyncio.run(main())
A working solution to the problem:
added allow_redirects=True to async with session_detailinfos.get(detail_link, allow_redirects=True) as res_d:
added return_exceptions=True to await asyncio.gather(*tasks_detail_infos, return_exceptions=True)
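For reference, a sketch of where those two changes land in the script above (only these lines change; everything else stays as posted):

async def scrape_detail_infos(detail_link: str):
    async with aiohttp.ClientSession() as session_detailinfos:
        # allow_redirects=True: follow redirects so the extractor gets the final page body
        async with session_detailinfos.get(detail_link, allow_redirects=True) as res_d:
            body_d = await res_d.text()
            data = JsonLdExtractor().extract(body_d)
            await append_detail_infos(data)

# ... and in main():
#   return_exceptions=True: a failing page is returned as an exception object instead of aborting the gather
#   await asyncio.gather(*tasks_detail_infos, return_exceptions=True)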

Semaphore stuck in Python code, what's wrong?

I am trying to copy some files to an OCI bucket (Oracle Cloud Infrastructure).
The first 5 files are successfully copied, but then the script hangs, and the processes in the task manager die, leaving only the main one.
from array import array
from pathlib import Path
import oci
import datetime
from multiprocessing import Process
import threading
import logging
from oci.object_storage import UploadManager
from oci.object_storage.models import CreateBucketDetails
from oci.object_storage.transfer.constants import MEBIBYTE

logging.basicConfig(filename=r'############', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

# Number of max processes allowed at a time
concurrency = 5
sema = threading.BoundedSemaphore(concurrency)

# The root directory path, Replace with your path
p = Path(r"#####")

# The Compartment OCID
compartment_id = "#######"

# The Bucket name where we will upload
bucket_name = "######"

config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)

part_size = 2 * MEBIBYTE

today = datetime.date.today()
today = str(today)

def upload_to_object_storage(path: str, name: str, namespace):
    # upload_manager = UploadManager(object_storage_client, allow_parallel_uploads=False)
    with open(path, "rb") as in_file:
        logging.info("Starting upload {}".format(name))
        object_storage_client.put_object(namespace, bucket_name, name, in_file)
        # upload_manager.upload_file(namespace, bucket_name, name, in_file.name, part_size=part_size)
        logging.info("Finished uploading {}".format(name))
        sema.release()
        return

def createUploadProcess(object: Path, object_storage_client, namespace, proc_list):
    name = object.relative_to(p).as_posix()
    sema.acquire()
    process = Process(target=upload_to_object_storage, args=(object.as_posix(), name, namespace))
    proc_list.append(process)
    process.start()

def processDirectoryObjects(object: Path, object_storage_client, namespace, proc_list):
    if object.is_file():
        createUploadProcess(object, object_storage_client, namespace, proc_list)

def processDirectory(path: Path, object_storage_client, namespace, proc_list):
    if path.exists():
        logging.info("in directory ---- " + path.relative_to(p).as_posix())
        for objects in path.iterdir():
            if objects.is_dir():
                processDirectory(objects, object_storage_client, namespace, proc_list)
            else:
                if today in objects.name:
                    processDirectoryObjects(objects, object_storage_client, namespace, proc_list)

if __name__ == '__main__':
    config = config
    object_storage_client = object_storage_client
    sema = sema
    namespace = object_storage_client.get_namespace().data
    proc_list: array = []
    if p.exists() and p.is_dir():
        processDirectory(p, object_storage_client, namespace, proc_list)
    for job in proc_list:
        job.join()
I have approximately 50 files to copy, but it uploads 5 and then hangs. The execution shows the following error for each of the 5 processes:
Process Process-1:
Traceback (most recent call last):
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 258, in _bootstrap
self.run()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\#######\Documents\copia_bkp_oci2.py", line 49, in upload_to_object_storage
sema.release()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\threading.py", line 482, in release
raise ValueError("Semaphore released too many times")
ValueError: Semaphore released too many times
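No answer is included here, but one consistent reading of the traceback is that each spawned child process gets its own fresh copy of the module-level threading.BoundedSemaphore, so release() in the child fails ("released too many times") while the parent's semaphore is never released, which would explain the hang after 5 uploads. A minimal sketch of that idea using a multiprocessing.BoundedSemaphore passed explicitly to the workers (illustrative names and a stubbed upload, not the author's fix):

from multiprocessing import BoundedSemaphore, Process

concurrency = 5

def upload_worker(sema, path, name):
    try:
        pass  # object_storage_client.put_object(...) would go here, as in the original script
    finally:
        sema.release()  # always release, even if the upload raises

if __name__ == '__main__':
    sema = BoundedSemaphore(concurrency)  # shared with the children because it is passed as an argument
    procs = []
    for i in range(50):   # roughly one process per file to upload
        sema.acquire()    # block while 'concurrency' uploads are already in flight
        proc = Process(target=upload_worker, args=(sema, f"file{i}", f"name{i}"))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()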

Cycle an iterator using multiprocessing in Python

I have an iterator that retrieves a varying number of lines from a very large (>20 GB) file, depending on some features. The iterator works fine, but I can only use 1 thread to process the result. I would like to feed the value from each iteration to multiple threads/processes.
I'm using a text file with 9 lines to mimic my data; here is my code. I've been struggling with how to create the feedback so that when one process finishes, it goes and retrieves the next iteration:
from multiprocessing import Process, Manager
import time

# Iterator
class read_file(object):
    def __init__(self, filePath):
        self.file = open(filePath, 'r')

    def __iter__(self):
        return self

    def __next__(self):
        line = self.file.readline()
        if line:
            return line
        else:
            raise StopIteration

# worker for one process
def print_worker(a, n, stat):
    print(a)
    stat[n] = True  # Set the finished status as True
    return None

# main
def main():
    file_path = 'tst_mp.txt'  # the txt file with 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    workers = []

    # Create shared list for storing the dereplicated dict and progress counter
    manager = Manager()
    status = manager.list([False] * 2)  # list of dictionaries for each thread

    # Initiate the workers
    for i in range(n_worker):
        workers.append(Process(target=print_worker, args=(file_handle.__next__(), i, status,)))
    for worker in workers:
        worker.start()

    block = file_handle.__next__()  # The next block (line)
    while block:  # continue while there is still a block left
        print(status)
        time.sleep(1)  # for every second
        for i in range(2):
            if status[i]:  # Worker i finished
                workers[i].join()
                # workers[i].close()
                workers[i] = Process(target=print_worker, args=(block, i, status,))
                status[i] = False  # Set worker i as busy (False)
                workers[i].start()  # Start worker i
                try:  # try to get the next item in the iterator
                    block = file_handle.__next__()
                except StopIteration:
                    block = False

if __name__ == '__main__':
    main()
The code is clumsy, but it does print out the sequence, though it also raised an error when I ran the code a second time:
1
2
3
4
5
6
7
8
9
Process Process-10:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/zewei/share/paf_depth/test_multiprocess.py", line 31, in print_worker
stat[n] = True # Set the finished status as True
File "<string>", line 2, in __setitem__
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
self._connect()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 794, in _connect
dispatch(conn, None, 'accept_connection', (name,))
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 90, in dispatch
kind, result = c.recv()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 255, in recv
buf = self._recv_bytes()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv(4)
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 384, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
This is where I'm stuck. Is there a fix, or a more elegant way to do this?
Thanks!
Here's a better way to do what you are doing, using pool:
from multiprocessing import Pool
import time
.
.
.
.
# worker for one process
def print_worker(a):
    print(a)
    return None

def main():
    file_path = r''  # the txt file with 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    results = []
    with Pool(n_worker) as pool:
        for result in pool.imap(print_worker, file_handle):
            results.append(result)
    print(results)

if __name__ == '__main__':
    main()
Here, imap lazily iterates over the iterator, so the whole file won't be read into memory. Pool handles spreading the tasks across the number of processes you started (n_worker) automatically, so you don't have to manage it yourself.
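As a self-contained illustration of that pattern (a toy stand-in, not the asker's file-based iterator), a generator plays the role of read_file and the results come back in input order:

from multiprocessing import Pool

def square(x):
    return x * x

def lines():
    # lazy stand-in for the file iterator
    for i in range(1, 10):
        yield i

if __name__ == '__main__':
    with Pool(2) as pool:
        # imap yields results one at a time, in input order
        print(list(pool.imap(square, lines())))  # [1, 4, 9, 16, 25, 36, 49, 64, 81]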

Multiprocessing using partial() throws ForkingPickler error

I am trying to crawl abstracts from PubMed and filter them using regex in Python. To speed things up, I wanted to use Python's multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple

def beautify_abstract(abstract: str, regex: str):
    import re
    result: str = ""
    last_start = 0
    matches = re.finditer(regex, abstract, re.MULTILINE)
    for matchNum, match in enumerate(matches, start=1):
        result += abstract[last_start:match.start()]
        result += "<b>"
        result += abstract[match.start():match.end()]
        result += "</b>"
        last_start = match.end()
    result += abstract[last_start:]
    return result

def get_doi(pim: str, regex: str):
    from Bio import Entrez
    from Bio.Entrez import efetch
    import re
    from metapub.convert import pmid2doi
    Entrez.email = "Your.Name.Here#example.org"
    print(f"Processing {pim}")
    abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
    abstract = abstract_handle.read()
    abstract_handle.close()
    if re.search(regex, abstract, re.MULTILINE) is not None:
        docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
        docsum = docsum_handle.read()
        try:
            doi = pmid2doi(pim)
        except:
            doi = "UNKNOWN"
        return f"{doi}"
    return ""

def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
    from Bio import Entrez
    Entrez.email = "Your.Name.Here#example.org"
    searchterm = " ".join(keywords)
    pims = []
    handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
    record = Entrez.read(handle)
    handle.close()
    count = int(record['Count'])
    if count > 100000:
        retmax = 100000
    else:
        retmax = count
    retstart = 0
    while retstart < count:
        handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
        record = Entrez.read(handle)
        handle.close()
        for pim in record['IdList']:
            pims.append(pim)
        retstart += retmax
    return pims

if __name__ == '__main__':
    keywords = ["keyword1", "keyword2"]
    pim_list = get_pim_with_regex_list(keywords)
    regex = "keyword1 keyword2"
    worker_fn = partial(get_doi, regex=regex)
    pool = mp.Pool(mp.cpu_count())
    entries = pool.map(worker_fn, pim_list)
    pool.close()
    pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with Python and found out that only Python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what the problem may be.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?
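No answer is included here, but one thing worth checking (an assumption suggested by the 'tag' and 'attributes' arguments in the unpickling error, not a confirmed diagnosis): Entrez.read() returns Biopython StringElement objects rather than plain str, and those may not survive the ForkingPickler round trip. Casting the IDs to built-in strings before handing them to the pool is a cheap experiment:

# hypothetical tweak inside the __main__ block above: force plain built-in strings
# before the IDs are pickled over to the worker processes
pim_list = [str(pim) for pim in get_pim_with_regex_list(keywords)]
worker_fn = partial(get_doi, regex=regex)
with mp.Pool(mp.cpu_count()) as pool:
    entries = pool.map(worker_fn, pim_list)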

Multiprocessing with text scraping

I want to scrape <p> tags from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to a variable.
I want to append the result of the scraping to data = [].
I made url_common for the base website, since some pages don't start with http etc.
from tqdm import tqdm
import faster_than_requests as requests  # 20% faster on average in my case than urllib.request
import bs4 as bs

def scrape(link, data):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a function with that signature.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)

from multiprocessing import Pool

p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using above function:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
task = get()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do it so that it uses Pool and at the same time appends the result of the scraping to the given variable.
EDIT
I changed it a little to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try:  # tries are always helpful with urls, as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)

links = ['link', 'other_link', 'another_link']
scrape(links)
# WORKS PROPERLY AND PRINTS EVERYTHING

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    # DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []

def mapping(x):  # <-- note that the mapping accepts a single value
    return x**2

for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So, to solve your problem, remove the outermost for-loop, since iterating is handled by the map function:
def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht, 'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
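Since the original goal was to collect the text into data rather than print it, a small variation on the above (a sketch reusing the same per-link scrape logic) returns each page's paragraphs and lets map gather them:

def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except Exception:
        return []  # skip links that fail to download
    parsed = bs.BeautifulSoup(ht, 'lxml')
    return [p.text for p in parsed.find_all('p')]

if __name__ == '__main__':
    from multiprocessing import Pool
    links = ['link', 'other_link', 'another_link']
    with Pool(10) as pool:
        per_link = pool.map(scrape, links)  # one list of paragraph texts per link
    data = [text for texts in per_link for text in texts]  # flatten into a single list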
