Failed to parse content from a webpage using requests - python

I'm trying to create a script using the requests module (without using a Session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a Session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from the dropdown.
Get the links to the detail pages.
Grab these two fields from each detail page.
While creating the script using plain requests, I tried to make use of cookies but I ended up getting an AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link

def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What possible change should I bring about to make the script work?

There's a redirect that occurs on fetch_detail_page_link. Python Requests follows redirects by default. When your script obtains the cookies, it is only grabbing the cookies for the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it was preserving those cookies for you.
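To see what that looks like in practice, here is a minimal sketch of collecting the cookies from every hop of the redirect chain (reusing the cat_link and headers values from the script above); the full rewrite follows below:

import requests
from requests.cookies import RequestsCookieJar

# cat_link and headers are the same values used in fetch_detail_page_link above
res = requests.get(cat_link, headers=headers)

cookies = RequestsCookieJar()
for hop in res.history:          # each redirect response, oldest first
    cookies.update(hop.cookies)  # collect cookies set while redirecting
cookies.update(res.cookies)      # plus cookies set by the final response

print(cookies.get_dict())        # the session cookie should now be present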
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However, if you insist on not using a Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)    # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link

def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
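For comparison, here is a minimal sketch of what the Session-based approach could look like, since a Session carries the redirect cookies for you automatically (same selectors and URLs as above, trimmed to the first field only):

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Scraping Your Vigentes 1.0'

    # the session remembers cookies from every redirect automatically
    r = s.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))

    res = s.get(category_link)
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        onclick = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", onclick)[0]
        detail_soup = BeautifulSoup(s.get(vigen_detail_page.format(detail_num)).text, "html.parser")
        try:
            expediente = detail_soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
        except AttributeError:
            expediente = ""
        print(expediente)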

Related

How can dynamic CSS selectors be used with Beautiful Soup?

The following code scrapes data from specific selectors for the sites in siteUrlArray. It works fine.
However, this requires me to write one function for each website just to define the selectors. I tried to dynamically build the
soup.find and soup.select calls using exec and a dict to hold the selector strings, but I can't get it to work.
Working Code
from bs4 import BeautifulSoup
import requests
import sys
import tldextract

def processmarketbeat(soup):
    try:
        mbtSel = soup.findAll("div", {"id": "cphPrimaryContent_tabAnalystRatings"})[0].find("table")
        print(mbtSel)
    except Exception as e:
        print(str(e))

def processwsj(soup):
    try:
        wsjSel = soup.select('.at8-col4 > .zonedModule')[0]
        print(wsjSel)
    except Exception as e:
        print(str(e))

global siteUrl, domain, header, parser
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
parser = 'html.parser'
siteUrlArray = ['http://www.marketbeat.com/stocks/NASDAQ/GOOGL/price-target',
                'http://www.wsj.com/market-data/quotes/GOOGL/research-ratings',
                'http://www.marketbeat.com/stocks/NASDAQ/SQ/price-target',
                'http://www.wsj.com/market-data/quotes/SQ/research-ratings']

for i in range(len(siteUrlArray)):
    siteUrl = siteUrlArray[i]
    parsedUrl = tldextract.extract(siteUrl)
    domain = parsedUrl.domain
    r = requests.get(siteUrl, headers=header, verify=False)
    soup = BeautifulSoup(r.text, parser)
    getattr(sys.modules[__name__], "process%s" % domain)(soup)
Attempt at using dynamic selector
stockDict = {
    'marketbeat': '"""x = soup.findAll("div", {"id": "cphPrimaryContent_tabAnalystRatings"})[0].find("table")"""',
    'wsj': '"""x = soup.select(".at8-col4 > .zonedModule")[0]"""'
}

header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
parser = 'html.parser'
siteUrlArray = ['http://www.marketbeat.com/stocks/NASDAQ/GOOGL/price-target',
                'http://www.wsj.com/market-data/quotes/GOOGL/research-ratings',
                'http://www.marketbeat.com/stocks/NASDAQ/SQ/price-target',
                'http://www.wsj.com/market-data/quotes/SQ/research-ratings']

for i in range(len(siteUrlArray)):
    siteUrl = siteUrlArray[i]
    parsedUrl = tldextract.extract(siteUrl)
    domain = parsedUrl.domain
    r = requests.get(siteUrl, headers=header, verify=False)
    soup = BeautifulSoup(r.text, parser)
    selector = stockDict.get(domain)
    exec(selector)
    # I want the EXEC to run the equivalent of
    # x = soup.findAll("div", {"id": "cphPrimaryContent_tabAnalystRatings"})[0].find("table")
    # so that I can print the tags as print(x)
    print(x)
But x prints as None instead of the HTML of the selected elements (note that the dict values wrap the statement in an extra layer of quotes, so exec only evaluates a triple-quoted string literal rather than running the assignment).
I was able to achieve what I intended to do with the following code:
selectorDict = {
    'marketbeat': 'x = soup.findAll("div", {"id": "cphPrimaryContent_tabAnalystRatings"})[0].find("table")\nprint(x)',
    'wsj': 'x = soup.select(".at8-col4 > .zonedModule")[0]\nprint(x)'
}

for i in range(len(siteUrlArray)):
    siteUrl = siteUrlArray[i]
    print(siteUrl)
    parsedUrl = tldextract.extract(siteUrl)
    domain = parsedUrl.domain
    r = requests.get(siteUrl, headers=header, verify=False)
    soup = BeautifulSoup(r.text, parser)
    selector = selectorDict.get(domain)
    try:
        exec(selector)
    except Exception as e:
        print(str(e))
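As a side note, the same per-domain dispatch can be done without exec by storing callables in the dict instead of code strings; this keeps the selectors as ordinary Python and avoids the quoting issues. A minimal sketch, reusing header, parser, siteUrlArray and the imports from the working code above:

# one selector function per domain; each takes a soup and returns the element of interest
selectorFuncs = {
    'marketbeat': lambda soup: soup.findAll("div", {"id": "cphPrimaryContent_tabAnalystRatings"})[0].find("table"),
    'wsj': lambda soup: soup.select(".at8-col4 > .zonedModule")[0],
}

for siteUrl in siteUrlArray:
    domain = tldextract.extract(siteUrl).domain
    r = requests.get(siteUrl, headers=header, verify=False)
    soup = BeautifulSoup(r.text, parser)
    try:
        x = selectorFuncs[domain](soup)  # look up and call the selector for this domain
        print(x)
    except Exception as e:
        print(str(e))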

Data Multithread Parsing

I'm looking for a way to optimize my Python code into a multithreaded application.
My code works in the following way: it downloads the sitemap, gathering all links from it into map_links. After that, the parser function visits every link found and gathers the data from the tag I need.
import threading
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

base_url = 'https://example.com/page.php'
sitemap_url = 'https://example.com/sitemap.xml'  # https://exgfmeetworld.xyz/sitemap.xml

# sitemap parsing function
def sitemap(sitemap_url, headers):
    map_links = []
    session = requests.Session()
    request = session.get(sitemap_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'xml')
        for links in soup.find_all('loc'):
            map_links.append(links.text)
    return map_links

# main parsing function
def parser(base_url, headers):
    session = requests.Session()
    request = session.get(base_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        # keyword = soup.find_all('h1', attrs={'class':'vedaky'})
        keyword = soup.select('h1')[0].get_text()
    else:
        print('error')
        pass
    return keyword

# main function
def main():
    all_links = sitemap(sitemap_url, headers)
    for i in all_links:
        keyword_pars = parser(i, headers)
        print(keyword_pars)

if __name__ == '__main__':
    main()
I have tried "from multiprocessing import Pool" but it doesn't work for my purpose. I need a non-pool solution because I need much higher performance from the script. I'm planning to run it in more than 20 threads.
I can't test it without the proper links, but I think this will do what you want. It works by mapping the parser function over the list of links with a thread pool of 20 worker threads; pool.map collects each return value into a results list in the same order as the input links.
Note that I haven't added any of the much-needed error handling.
import threading
import requests
from bs4 import BeautifulSoup as bs
from multiprocessing.dummy import Pool as ThreadPool

SITE_MAP_URL = 'https://exgfmeetworld.xyz/sitemap.xml'
BASE_URL = 'https://example.com/page.php'
HEADERS = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

def get_site_map() -> list:
    request = requests.get(SITE_MAP_URL, headers=HEADERS, timeout=5)
    links = []
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        links = [l.text for l in soup.find_all("loc")]
    return links

def parser(link: str):
    request = requests.get(link, headers=HEADERS, timeout=5)
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        return soup.find("h1").text
    return None

# - MAIN
links = get_site_map()
parser_output = []

pool = ThreadPool(20)
results = pool.map(parser, links)
pool.close()
pool.join()

print(results)
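If you would rather avoid any pool at all and manage the threads yourself, a minimal sketch using threading.Thread and a pre-sized results list (reusing get_site_map and parser from above) could look like this; in practice you would still want to cap how many threads run at once, for example by starting them in batches of 20:

import threading

def worker(link, results, index):
    # each thread writes into its own slot, so no lock is needed
    results[index] = parser(link)

links = get_site_map()
results = [None] * len(links)
threads = []

for i, link in enumerate(links):
    t = threading.Thread(target=worker, args=(link, results, i))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print(results)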

BeautifulSoup not returning full HTML of the page

I want to scrape a few fields (title, URL, ASIN) from Amazon search pages, but I ran into a problem: the script only parses 15 products while the page shows 50. I decided to print out all the HTML to the console and saw that the HTML ends at 15 products, without any errors from the script.
Here is the relevant part of my script:
keyword = "men jeans".replace(' ', '+')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
url = "https://www.amazon.com/s/field-keywords={}".format(keyword)
request = requests.session()
req = request.get(url, headers = headers)
sleep(3)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup)
It's because a few of the items are generated dynamically. There might be a better solution than using selenium; however, as a workaround, you can try the approach below instead.
from selenium import webdriver
from bs4 import BeautifulSoup

def fetch_item(driver, keyword):
    driver.get(url.format(keyword.replace(" ", "+")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for items in soup.select("[id^='result_']"):
        try:
            name = items.select_one("h2").text
        except AttributeError:
            name = ""
        print(name)

if __name__ == '__main__':
    url = "https://www.amazon.com/s/field-keywords={}"
    driver = webdriver.Chrome()
    try:
        fetch_item(driver, "men jeans")
    finally:
        driver.quit()
Upon running the above script, you should get around 56 names as the result.
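If some result containers render after page_source is read, an explicit wait before parsing may help; here is a hedged sketch of how fetch_item from the script above could wait for the first result container (same selector as in the loop):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def fetch_item(driver, keyword):
    driver.get(url.format(keyword.replace(" ", "+")))
    # wait up to 10 seconds until at least one result container is present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "[id^='result_']"))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for items in soup.select("[id^='result_']"):
        try:
            name = items.select_one("h2").text
        except AttributeError:
            name = ""
        print(name)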
import requests
from bs4 import BeautifulSoup

for page in range(1, 21):
    keyword = "red car".replace(' ', '+')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
    url = "https://www.amazon.com/s/field-keywords=" + keyword + "?page=" + str(page)
    request = requests.session()
    req = request.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    results = soup.findAll("li", {"class": "s-result-item"})
    for i in results:
        try:
            print(i.find("h2", {"class": "s-access-title"}).text.replace('[SPONSORED]', ''))
            print(i.find("span", {"class": "sx-price-large"}).text.replace("\n", ' '))
            print('*' * 20)
        except:
            pass
Amazon's page range maxes out at 20; this crawls through the pages.

requests fails to keep a logged-in session

I am trying to scrape some emails from mdpi.com; the emails are available only to logged-in users, but my script fails to get them — it only gets the content that is shown when logged out.
Code itself:
import requests
from bs4 import BeautifulSoup
import traceback

login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1,}

base_url = 'http://www.mdpi.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}

session = requests.Session()
session.headers = headers

# log_in
s = session.post('https://susy.mdpi.com/user/login', data=login_data)
print(s.text)
print(session.cookies)

def make_soup(url):
    try:
        r = session.get(url)
        soup = BeautifulSoup(r.content, 'lxml')
        return soup
    except:
        traceback.print_exc()
        return None

example_link = 'http://www.mdpi.com/search?journal=medsci&year_from=1996&year_to=2017&page_count=200&sort=relevance&view=default'

def article_finder(soup):
    one_page_articles_divs = soup.find_all('div', class_='article-content')
    for article_div in one_page_articles_divs:
        a_link = article_div.find('a', class_='title-link')
        link = base_url + a_link.get('href')
        print(link)
        article_soup = make_soup(link)
        grab_author_info(article_soup)

def grab_author_info(article_soup):
    # title of the article
    article_title = article_soup.find('h1', class_="title").text
    print(article_title)

    # affiliation
    affiliations_div = article_soup.find('div', class_='art-affiliations')
    affiliation_dict = {}
    aff_indexes = affiliations_div.find_all('div', class_='affiliation-item')
    aff_values = affiliations_div.find_all('div', class_='affiliation-name')
    for i, index in enumerate(aff_indexes):  # 0, 1
        affiliation_dict[int(index.text)] = aff_values[i].text

    # authors names
    authors_div = article_soup.find('div', class_='art-authors')
    authors_spans = authors_div.find_all('span', class_='inlineblock')
    for span in authors_spans:
        name_and_email = span.find_all('a')  # name and email
        name = name_and_email[0].text
        # email
        email = name_and_email[1].get('href')[7:]
        # affiliation_index
        affiliation_index = span.find('sup').text
        indexes = set()
        if len(affiliation_index) > 2:
            for i in affiliation_index.strip():
                try:
                    ind = int(i)
                    indexes.add(ind)
                except ValueError:
                    pass
        print(name)
        for index in indexes:
            print('affiliation =>', affiliation_dict[index])
        print('email: {}'.format(email))

if __name__ == '__main__':
    article_finder(make_soup(example_link))
What should I do in order to get what I want?
Ah, that is easy: you haven't managed to log in correctly. If you look at the response from your initial call, you will see that you are returned the login page HTML instead of the profile page. The reason for this is that you are not submitting the hidden token on the form.
The solution: request the login page, then use either lxml or BeautifulSoup to parse the hidden input 'form[_token]'. Get that value and add it to your login_data payload.
Then submit your login request and you'll be in.
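A hedged sketch of that flow, assuming the login page at https://susy.mdpi.com/user/login really does contain a hidden input named 'form[_token]' (check the actual markup in your browser):

import requests
from bs4 import BeautifulSoup

login_url = 'https://susy.mdpi.com/user/login'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}

session = requests.Session()
session.headers = headers

# 1. fetch the login page and pull the hidden CSRF token out of the form
login_page = session.get(login_url)
login_soup = BeautifulSoup(login_page.content, 'lxml')
token_input = login_soup.find('input', {'name': 'form[_token]'})
token = token_input.get('value') if token_input else ''

# 2. add the token to the payload and submit the login form
login_data = {
    'form[email]': 'xxxxxxx@gmail.com',
    'form[password]': 'xxxxxxxxx',
    'form[_token]': token,
    'remember': 1,
}
s = session.post(login_url, data=login_data)
# s.text should now be the logged-in page rather than the login form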

Getting a list of URLs and then finding specific text from all of them in Python 3.5.1

So I have this code that gives me the URLs I need as a list:
import requests
from bs4 import BeautifulSoup

offset = 0
links = []

with requests.Session() as session:
    while True:
        r = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        soup = BeautifulSoup(r.content, "html.parser")
        new_links = soup.find_all("a", {'class': "thumb"})

        # no more links - break the loop
        if not new_links:
            break

        # denotes the number of gallery pages gone through at one time (# of pages times 24 equals the number below)
        links.extend(new_links)
        print(len(links))
        offset += 24

        # denotes the number of gallery pages (# of pages times 24 equals the number below)
        if offset == 48:
            break

for link in links:
    print(link.get("href"))
After that I try to get some text from each of those URLs; the text sits in roughly the same place on each page. But whenever I run the second half, below, I keep getting a chunk of HTML text and some errors, and I'm not sure how to fix it, or whether there is another, preferably simpler, way to get the text from each URL.
import urllib.request
import re

for link in links:
    url = print("%s" % link)
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()

    paragraphs = re.findall(r'</a><br /><br />(.*?)</div>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span></div>', str(respData))

    for eachP in paragraphs:
        print(eachP)

    title = re.findall(r'<title>(.*?)</title>', str(respData))
    for eachT in title:
        print(eachT)
Your code:
for link in links:
    url = print("%s" % link)
assigns None to url. Perhaps you mean:
for link in links:
    url = "%s" % link.get("href")
There's also no reason to use urllib to get the site's content; you can use requests as you did before by changing:
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
to
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
Now you can get the title and paragraph with just:
title = soup.find('div', {'class': 'dev-title-container'}).h1.text
paragraph = soup.find('div', {'class': 'text block'}).text
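Putting those pieces together, the second half of the script could look something like this (a sketch; the div class names are the ones suggested above and may need adjusting if the page markup differs):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# links is the list collected by the first half of the script
for link in links:
    url = link.get("href")
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "html.parser")

    title = soup.find('div', {'class': 'dev-title-container'}).h1.text
    paragraph = soup.find('div', {'class': 'text block'}).text

    print(title)
    print(paragraph)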
