Multiprocessing with text scraping - python

I want to scrape <p> from pages and since there will be a couple thousands of them I want to use multiprocessing. However, it doesn't work when I try to append the result to some variable
I want to append the result of scraping to the data = []
I made a url_common for a base website since some pages don't start with HTTP etc.
from tqdm import tqdm
import faster_than_requests as requests #20% faster on average in my case than urllib.request
import bs4 as bs
def scrape(link, data):
for i in tqdm(link):
if i[:3] !='htt':
url_common = 'https://www.common_url.com/'
else:
url_common = ''
try:
ht = requests.get2str(url_common + str(i))
except:
pass
parsed = bs.BeautifulSoup(ht,'lxml')
paragraphs = parsed.find_all('p')
for p in paragraphs:
data.append(p.text)
Above doesn't work, since map() doesn't accept function like above
I tried to use it another way:
def scrape(link):
for i in tqdm(link):
if i[:3] !='htt':
url_common = 'https://www.common_url.com/'
else:
url_common = ''
try:
ht = requests.get2str(url_common + str(i))
except:
pass
parsed = bs.BeautifulSoup(ht,'lxml')
paragraphs = parsed.find_all('p')
for p in paragraphs:
print(p.text)
from multiprocessing import Pool
p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using above function:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
task = get()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured a way to do it so that it uses Pool and at the same time appending the result of scraping to the given variable
EDIT
I change a little bit to see where it stops:
def scrape(link):
for i in tqdm(link):
if i[:3] !='htt':
url_common = 'https://www.investing.com/'
else:
url_common = ''
try: #tries are always halpful with url as you never know
ht = requests.get2str(url_common + str(i))
except:
pass
print('works1')
parsed = bs.BeautifulSoup(ht,'lxml')
paragraphs = parsed.find_all('p')
print('works2')
for p in paragraphs:
print(p.text)
links = ['link', 'other_link', 'another_link']
scrape(links)
#WORKS PROPERLY AND PRINTS EVERYTHING
if __name__ == '__main__':
p = Pool(5)
print(p.map(scrape, links))
#DOESN'T WORK, NOTHING PRINTS. Error like above

You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []
def mapping(x): # <-- note that the mapping accepts a single value
return x**2
for item in to_be_mapped:
res = mapping(item)
mapped.append(res)
So to solve your problem remove the outermost for-loop as iterating is handled by the map function
def scrape(link):
if link[:3] !='htt':
url_common = 'https://www.common_url.com/'
else:
url_common = ''
try:
ht = requests.get2str(url_common + str(link))
except:
pass
parsed = bs.BeautifulSoup(ht,'lxml')
paragraphs = parsed.find_all('p')
for p in paragraphs:
print(p.text)

Related

Multiprocessing using partial() throws ForkingPickler error

I am trying to crawl abstracts from PubMed and filtering them using regex via python. To speed things up, I wanted to use pythons multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple
def beautify_abstract(abstract: str, regex: str):
import re
result: str = ""
last_start = 0
matches = re.finditer(regex, abstract, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
result += abstract[last_start:match.start()]
result += "<b>"
result += abstract[match.start():match.end()]
result += "</b>"
last_start = match.end()
result += abstract[last_start:]
return result
def get_doi(pim: str, regex: str):
from Bio import Entrez
from Bio.Entrez import efetch
import re
from metapub.convert import pmid2doi
Entrez.email = "Your.Name.Here#example.org"
print(f"Processing {pim}")
abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
abstract = abstract_handle.read()
abstract_handle.close()
if re.search(regex, abstract, re.MULTILINE) is not None:
docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
docsum = docsum_handle.read()
try:
doi = pmid2doi(pim)
except:
doi = "UNKNOWN"
return f"{doi}"
return ""
def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
from Bio import Entrez
Entrez.email = "Your.Name.Here#example.org"
searchterm = " ".join(keywords)
pims = []
handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
count = int(record['Count'])
if count > 100000:
retmax = 100000
else:
retmax = count
retstart = 0
while retstart < count:
handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
for pim in record['IdList']:
pims.append(pim)
retstart += retmax
return pims
if __name__ == '__main__':
keywords = ["keyword1", "keyword2"]
pim_list = get_pim_with_regex_list(keywords)
regex = "keyword1 keyword2"
worker_fn = partial(get_doi, regex=regex)
pool = mp.Pool(mp.cpu_count())
entries = pool.map(worker_fn, pim_list)
pool.close()
pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with python and found out that only python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what may be the problem.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?

Weird error with pool module and beautiful soup: Invalid URL 'h'

I am scraping a very large website with Beautiful Soup for a project and want to use the Pool module to speed it up. I am getting a weird error where it is not correctly reading the list of URL's, as far as I can tell it is just grabbing the first 'h'.
The entire code works perfectly if I do not use pool. The list of URL's is read properly. I am not sure if there is something weird about how you have to prepare the URL's when calling p.map(scrapeClauses, links) because if I simply call scrapeClauses(links) everything works.
Here is my main function:
if __name__ == '__main__':
links = list()
og = 'https://www.lawinsider.com'
halflink = '/clause/limitation-of-liability'
link = og + halflink
links.append(link)
i = 0
while i < 50:
try:
nextLink = generateNextLink(link)
links.append(nextLink)
link = nextLink
i += 1
except:
print('Only ', i, 'links found')
i = 50
start_time = time.time()
print(links[0])
p = Pool(5)
p.map(scrapeClauses, links)
p.terminate()
p.join()
#scrapeClauses(links)
and here is scrapeClauses():
def scrapeClauses(links):
#header to avoid site detecting scraper
headers = requests.utils.default_headers()
headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
#list of clauses
allText = []
number = 0
for line in links:
page_link = line
print(page_link)
page_response = requests.get(page_link, headers=headers)
html_soup = BeautifulSoup(page_response.content, "html.parser")
assignments = html_soup.find_all('div', class_ ='snippet-content')
for i in range(len(assignments)):
assignments[i] = assignments[i].get_text()
#option to remove te assignment that precedes each clause
#assignments[i] = assignments[i].replace('Assignment.','',1)
allText.append(assignments[i])
#change the index of the name of the word doc
name = 'limitationOfLiability' + str(number) + '.docx'
#some clauses have special characters tat produce an error
try:
document = Document()
stuff = assignments[i]
document.add_paragraph(stuff)
document.save(name)
number += 1
except:
continue
I did not include generateNextLink() to save space and because I am pretty sure the error is not in there but if someone thinks it is I will provide it.
As you can see I 'print(page_link) in scrapeClauses. If I am not using pool, it will print all the normal links. But if I use pool, a bunch of h's print out line after line. I then get and error that h is not a valid URL. I will show the error code below.
https://www.lawinsider.com/clause/limitation-of-liability
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiproce
ssing\pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiproce
ssing\pool.py", line 44, in mapstar
return list(map(*args))
File "C:\Users\wquinn\Web Scraping\assignmentBSScraper.py", line 20, in scrape
Clauses
page_response = requests.get(page_link, headers=headers)
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packa
ges\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps
you meant http://h?
The second argument of p.map get an list. Each such element will be sent to a function. So you function got a string and not a list of string as you expect.
The minimal example is:
from multiprocessing import Pool
def f(str_list):
for x in str_list:
print ('hello {}'.format(x))
if __name__ == '__main__':
str_list = ['111', '2', '33']
p = Pool(5)
p.map(f, str_list)
p.terminate()
p.join()
Output is:
hello 1
hello 1
hello 1
hello 2
hello 3
hello 3

python crawler ieee paper keywords

i trying to use crawler to get ieee paper keywords but now i get a error
how can to fix my crawler?
my code is here
import requests
import json
from bs4 import BeautifulSoup
ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
for i in tag[9]:
s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace("'", '"').replace(";", ''))
and error is here
Traceback (most recent call last):
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 90, in <module>
a.get_es_data(offset=0, size=1)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 53, in get_es_data
self.get_data(link=ieee_link, esid=es_id)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 65, in get_data
s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace(";", '').replace("'", '"'))
IndexError: list index out of range
Here's another answer. I don't know what you are doing with 's' in your code after the load (replace) in my code.
The code below doesn't thrown an error, but again how are you using 's'
import requests
import json
from bs4 import BeautifulSoup
ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
# i is a list
for i in tag[9]:
metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
metadata = re.findall(metadata_format, i)
if len(metadata) != 0:
# convert the list
convert_to_json = json.dumps(metadata)
x = json.loads(convert_to_json)
s = x[0].replace("'", '"').replace(";", '')
###########################################
# I don't know what you plan to do with 's'
###########################################
print (s)
Apparently in line 65 some of the data provided in i did not suite the regex pattern you're trying to use. Therefor your [0] will not work as the data returned is not an array of suitable length.
Solution:
x = json.loads(re.findall('global.document.metadata=(.*;)', i)
if x:
s = x[0].replace("'", '"').replace(";", ''))

TypeError: 'NoneType' object is not iterable: Webcrawler to scrape email addresses

I am trying to get the below program working. It is supposed to find email addresses in a website but, it is breaking. I suspect the problem is with initializing result = [] inside the crawl function. Below is the code:
# -*- coding: utf-8 -*-
import requests
import re
import urlparse
# In this example we're trying to collect e-mail addresses from a website
# Basic e-mail regexp:
# letter/number/dot/comma # letter/number/dot/comma . letter/number
email_re = re.compile(r'([\w\.,]+#[\w\.,]+\.\w+)')
# HTML <a> regexp
# Matches href="" attribute
link_re = re.compile(r'href="(.*?)"')
def crawl(url, maxlevel):
result = []
# Limit the recursion, we're not downloading the whole Internet
if(maxlevel == 0):
return
# Get the webpage
req = requests.get(url)
# Check if successful
if(req.status_code != 200):
return []
# Find and follow all the links
links = link_re.findall(req.text)
for link in links:
# Get an absolute URL for a link
link = urlparse.urljoin(url, link)
result += crawl(link, maxlevel - 1)
# Find all emails on current page
result += email_re.findall(req.text)
return result
emails = crawl('http://ccs.neu.edu', 2)
print "Scrapped e-mail addresses:"
for e in emails:
print e
The error I get is below:
C:\Python27\python.exe "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py"
Traceback (most recent call last):
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 41, in <module>
emails = crawl('http://ccs.neu.edu', 2)
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 35, in crawl
result += crawl(link, maxlevel - 1)
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 35, in crawl
result += crawl(link, maxlevel - 1)
TypeError: 'NoneType' object is not iterable
Process finished with exit code 1
Any suggestions will help. Thanks!
The problem is this:
if(maxlevel == 0):
return
Currently it return None when maxlevel == 0. You can't concatenate a list with a None object.
You need to return an empty list [] to be consistent.

TypeError: Popen not iterable

Am re-writing a program from previous question, and I am having trouble with it. Please see code:
#!/usr/bin/python
import subprocess,time, timeit
from multiprocessing import Process, Queue
import re, os, pprint, math
from collections import defaultdict
Dict = {}
identifier = ""
hexbits = []
count = defaultdict(int)
def __ReadRX__(RX_info):
lines = iter(RX_info.stdout.readline, "")
try:
start = time.clock()
for line in lines:
if re.match(r"^\d+.*$",line):
splitline = line.split()
del splitline[1:4]
identifier = splitline[1]
count[identifier] += 1
end = time.clock()
timing = round((end - start) * 10000, 100)
dlc = splitline[2]
hexbits = splitline[3:]
Dict[identifier] = [dlc, hexbits, count[identifier],int(timing)]
start = end
except keyboardinterrupt:
pass
procRX = subprocess.Popen('receivetest -f=/dev/pcan32'.split(), stdout=subprocess.PIPE)
if __name__ == '__main__':
munchCan = Process(target=__ReadRX__, args=(procRX))
munchCan.start()
munchCan.join()
print Dict
When trying to run the code I get the following error:
File "./cancheck2.py", line 36, in <module>
munchCan = Process(target=__ReadRx__, args=(procRX))
File "/usr/lib/python2.7/multiprocessing/process.py", line 104, in __init__
self._args = tuple(args)
TypeError: 'Popen' objec is not iterable
This code worked before I seperated the subprocess and set up __ReadRX__ as a seperate process.
Coudl anyone explain what is happening as I don't quite understand?
(procRX) doesn't create a tuple, you have to use (procRX,).

Categories

Resources