requests failes to keep logged in session - python

I am trying to scrape some emails from mdpi.com, emails available only to logged in users. But it fails when I am trying to do so. I am getting
when logged out:
Code itself:
import requests
from bs4 import BeautifulSoup
import traceback
login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1,}
base_url = 'http://www.mdpi.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}
session = requests.Session()
session.headers = headers
# log_in
s = session.post('https://susy.mdpi.com/user/login', data=login_data)
print(s.text)
print(session.cookies)
def make_soup(url):
try:
r = session.get(url)
soup = BeautifulSoup(r.content, 'lxml')
return soup
except:
traceback.print_exc()
return None
example_link = 'http://www.mdpi.com/search?journal=medsci&year_from=1996&year_to=2017&page_count=200&sort=relevance&view=default'
def article_finder(soup):
one_page_articles_divs = soup.find_all('div', class_='article-content')
for article_div in one_page_articles_divs:
a_link = article_div.find('a', class_='title-link')
link = base_url + a_link.get('href')
print(link)
article_soup = make_soup(link)
grab_author_info(article_soup)
def grab_author_info(article_soup):
# title of the article
article_title = article_soup.find('h1', class_="title").text
print(article_title)
# affiliation
affiliations_div = article_soup.find('div', class_='art-affiliations')
affiliation_dict = {}
aff_indexes = affiliations_div.find_all('div', class_='affiliation-item')
aff_values = affiliations_div.find_all('div', class_='affiliation-name')
for i, index in enumerate(aff_indexes): # 0, 1
affiliation_dict[int(index.text)] = aff_values[i].text
# authors names
authors_div = article_soup.find('div', class_='art-authors')
authors_spans = authors_div.find_all('span', class_='inlineblock')
for span in authors_spans:
name_and_email = span.find_all('a') # name and email
name = name_and_email[0].text
# email
email = name_and_email[1].get('href')[7:]
# affiliation_index
affiliation_index = span.find('sup').text
indexes = set()
if len(affiliation_index) > 2:
for i in affiliation_index.strip():
try:
ind = int(i)
indexes.add(ind)
except ValueError:
pass
print(name)
for index in indexes:
print('affiliation =>', affiliation_dict[index])
print('email: {}'.format(email))
if __name__ == '__main__':
article_finder(make_soup(example_link))
What should I do in order to get what I want?

Ah that is easy, you haven't managed to log in correctly. If you look at the response from your initial call you will see that you are returned the login page HTML instead of the my profile page. The reason for this is that you are not submitted the hidden token on the form.
The solution request the login page, and then use either lxml or BeautifulSoup to parse the hidden input 'form[_token]'. Get that value and then add it to your login_data payload.
Then submit your login request and you'll be in.

Related

Failed to parse content from a webpage using requests

I'm trying to create a script using requests module (without using session) to parse two fields from a webpage but the script fails miserably. However, when I created another script using session, I could fetch the content from that site flawlessly.
Here goes the manual steps to reach the content:
Choose the first item from dropdown.
Get the links to the detail page.
Grab these two fields from detail page.
While creating the script using plain requests, I tried to make use of cookies but I ended up getting AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
def grab_first_link_from_dropdown(link):
r = requests.get(link,headers=headers)
soup = BeautifulSoup(r.text,"html.parser")
category_link = urljoin(base,soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
return category_link
def fetch_detail_page_link(cat_link):
res = requests.get(cat_link,headers=headers)
str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
soup = BeautifulSoup(res.text,"html.parser")
for items in soup.select("table.list-table > tbody.list-tbody > tr"):
target_link = items.select_one("a.detailLink").get("onclick")
detail_num = re.findall(r"goToDetail\(\'(\d+?)\'",target_link)[0]
inner_link = vigen_detail_page.format(detail_num)
yield str_cookie,inner_link
def get_content(str_cookie,inner_link):
headers['Cookie'] = str_cookie
res = requests.get(inner_link,headers=headers)
soup = BeautifulSoup(res.text,"html.parser")
try:
expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
except AttributeError: expediente = ""
try:
descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
except AttributeError: descripcion = ""
return expediente,descripcion
if __name__ == '__main__':
category_link = grab_first_link_from_dropdown(link)
for cookie,detail_page_link in fetch_detail_page_link(category_link):
print(get_content(cookie,detail_page_link))
What possible change should I bring about to make the script work?
There's a redirect that occurs on fetch_detail_page_link. Python Requests follows redirects by default. When your script obtains the cookies, it is only grabbing the cookies for the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it was preserving those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However if you insist on not using Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'
headers = {
'User-Agent': "Scraping Your Vigentes 1.0",
}
def grab_first_link_from_dropdown(link):
r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
return category_link
def fetch_detail_page_link(cat_link):
res = requests.get(cat_link, headers=headers)
cookies = RequestsCookieJar() # create empty cookie jar
for r in res.history:
cookies.update(r.cookies) # merge in cookies from each redirect response
cookies.update(res.cookies) # merge in cookies from the final response
soup = BeautifulSoup(res.text, "html.parser")
for items in soup.select("table.list-table > tbody.list-tbody > tr"):
target_link = items.select_one("a.detailLink").get("onclick")
detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
inner_link = vigen_detail_page.format(detail_num)
yield cookies, inner_link
def get_content(cookies, inner_link):
res = requests.get(inner_link, headers=headers, cookies=cookies)
if not res.ok:
print("Got bad response %s :(" % res.status_code)
return "", ""
soup = BeautifulSoup(res.text, "html.parser")
try:
expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
except AttributeError:
expediente = ""
try:
descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
except AttributeError:
descripcion = ""
return expediente, descripcion
if __name__ == '__main__':
category_link = grab_first_link_from_dropdown(link)
for cookie, detail_page_link in fetch_detail_page_link(category_link):
print(get_content(cookie, detail_page_link))

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine and search companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re
companylist = ['ABEX Dachdecker Handwerks-GmbH']
url = 'https://www.firmenwissen.de/index.html'
payloads = {
'searchform': 'UFT-8',
'phrase':'ABEX Dachdecker Handwerks-GmbH',
"mainSearchField__button":'submit'
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list= []
links = soup.findAll('a')
for li in links:
link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change your initial url you are doing search for. Grab the appropriate hrefs only and add to a set to ensure no duplicates (or alter selector to return only one match if possible); add those items to a final set for looping to ensure only looping required number of links. I have used Session on assumption you will repeat for many companies.
Iterate over the set using selenium to navigate to each company url and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()
## searches section; gather into set
with requests.Session() as s:
for company in companyList:
payloads = {
'searchform': 'UFT-8',
'phrase':company,
"mainSearchField__button":'submit'
}
html = s.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'lxml')
companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
# print(soup.select_one('.fp-result').text)
finalLinks = finalLinks.union(companyLinks)
for item in finalLinks:
d.get(item)
info = d.find_element_by_css_selector('.yp_abstract_narrow')
address = d.find_element_by_css_selector('.yp_address')
print(info.text, address.text)
d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []
## searches section; add to list
with requests.Session() as s:
for company in companyList:
payloads = {
'searchform': 'UFT-8',
'phrase':company,
"mainSearchField__button":'submit'
}
html = s.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'lxml')
companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
finalLinks.append(companyLink)
for item in set(finalLinks):
d.get(item)
info = d.find_element_by_css_selector('.yp_abstract_narrow')
address = d.find_element_by_css_selector('.yp_address')
print(info.text, address.text)
d.quit()

How to extend the scraping done to more than 1st page using python

Hi I was going through a Python code (pasted below). The code is working fine for scraping the 1st page results (25 listing per page). However, I would want to extend its usability to scrape results from atleast 10 more pages
For example I want to generate results for zip code - 98021 which has 80 listings in total (until page 4). However, when I run the code below using python zillow.py 980021 newest, it displays only 25 listings
Since I am a newbie in python I request you to please help me out with this objective.
from lxml import html
import requests
import unicodecsv as csv
import argparse
def parse(zipcode,filter=None):
if filter=="newest":
url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
elif filter == "cheapest":
url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
else:
url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
for i in range(10):
# try:
headers= {
'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'accept-encoding':'gzip, deflate, sdch, br',
'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
'cache-control':'max-age=0',
'upgrade-insecure-requests':'1',
'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
response = requests.get(url,headers=headers)
print(response.status_code)
parser = html.fromstring(response.text)
search_results = parser.xpath("//div[#id='search-results']//article")
properties_list = []
for properties in search_results:
raw_address = properties.xpath(".//span[#itemprop='address']//span[#itemprop='streetAddress']//text()")
raw_city = properties.xpath(".//span[#itemprop='address']//span[#itemprop='addressLocality']//text()")
raw_state= properties.xpath(".//span[#itemprop='address']//span[#itemprop='addressRegion']//text()")
raw_postal_code= properties.xpath(".//span[#itemprop='address']//span[#itemprop='postalCode']//text()")
raw_price = properties.xpath(".//span[#class='zsg-photo-card-price']//text()")
raw_info = properties.xpath(".//span[#class='zsg-photo-card-info']//text()")
raw_broker_name = properties.xpath(".//span[#class='zsg-photo-card-broker-name']//text()")
url = properties.xpath(".//a[contains(#class,'overlay-link')]/#href")
raw_title = properties.xpath(".//h4//text()")
address = ' '.join(' '.join(raw_address).split()) if raw_address else None
city = ''.join(raw_city).strip() if raw_city else None
state = ''.join(raw_state).strip() if raw_state else None
postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
price = ''.join(raw_price).strip() if raw_price else None
info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',')
broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
title = ''.join(raw_title) if raw_title else None
property_url = "https://www.zillow.com"+url[0] if url else None
is_forsale = properties.xpath('.//span[#class="zsg-icon-for-sale"]')
properties = {
'address':address,
'city':city,
'state':state,
'postal_code':postal_code,
'price':price,
'facts and features':info,
'real estate provider':broker,
'url':property_url,
'title':title
}
if is_forsale:
properties_list.append(properties)
return properties_list
# except:
# print ("Failed to process the page",url)
if __name__=="__main__":
argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
argparser.add_argument('zipcode',help = '')
sortorder_help = """
available sort orders are :
newest : Latest property details,
cheapest : Properties with cheapest price
"""
argparser.add_argument('sort',nargs='?',help = sortorder_help,default ='Homes For You')
args = argparser.parse_args()
zipcode = args.zipcode
sort = args.sort
print ("Fetching data for %s"%(zipcode))
scraped_data = parse(zipcode,sort)
print ("Writing data to output file")
with open("properties-%s.csv"%(zipcode),'wb')as csvfile:
fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in scraped_data:
writer.writerow(row)
You need to grab the link to the next page from the current page and then update the url that you use to scrape from.
Here's a rough example of how that could work:
def parse(zipcode, url, filter=None):
# get results how you are
# get url from next page button
return results, next_page_url
full_results = []
results, next_page_url = parse(zipcode, initial_page_url, filter=filter)
full_results += results
while (len(results) >= 25 and next_page_url):
results, next_page_url = parse(zipcode, next_page_url, filter=filter)
full_results += results
So in this example parse takes the url to scrape from as a second positional argument and returns the results and the url for the next page to scrape.
This will just keep scraping as long as there are the max results (25) on the page and a url to the next page is returned.

Scraping reviews from tripadvisor

Suppose I am scraping a reviews from the url
https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html
It contents no of pages which contains the reviews which I want to scrape. So how can I scrape the reviews of all the next pages.
I used the below code but still shows only the reviews in first page only!
from bs4 import BeautifulSoup
import requests
URL_BASE = "https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html"
MAX_PAGES = 30
counter = 0
for i in range(1, MAX_PAGES):
if i > 1:
url = "%spage/%d/" % (URL_BASE, i)
else:
url = URL_BASE
req = requests.get(url)
statusCode = req.status_code
if statusCode == 200:
html = BeautifulSoup(req.text, "html.parser")
resultsoup = html.find_all('P', {'class': 'partial_entry'})
else:
break
for review in resultsoup:
review_list = review.get_text()
print(review_list)
Based on example for scrapy.
Server adds to url (in any place before .html)
-or5 to get second page,
-or10 to get third page,
etc.
You could even skip words (which are for SEO) and use only
https://www.tripadvisor.com/g562819-d289642-or5.html
https://www.tripadvisor.com/g562819-d289642-or10.html
to get next pages with reviews.
from bs4 import BeautifulSoup
import requests
import re
#import webbrowser
def get_soup(url):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
r = s.get(url, headers=headers)
#with open('temp.html', 'wb') as f:
# f.write(r.content)
# webbrowser.open('temp.html')
if r.status_code != 200:
print('status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def parse(url, response):
if not response:
print('no response:', url)
return
# get number of reviews
num_reviews = response.find('span', class_='reviews_header_count').text
num_reviews = num_reviews[1:-1] # remove `( )`
num_reviews = num_reviews.replace(',', '') # remove `,`
num_reviews = int(num_reviews)
print('num_reviews:', num_reviews, type(num_reviews))
# create template for urls to pages with reviews
url = url.replace('.html', '-or{}.html')
print('template:', url)
# load pages with reviews
for offset in range(0, num_reviews, 5):
print('url:', url.format(offset))
url_ = url.format(offset)
parse_reviews(url_, get_soup(url_))
return # for test only - to stop after first page
def parse_reviews(url, response):
print('review:', url)
if not response:
print('no response:', url)
return
# get every review
for idx, review in enumerate(response.find_all('div', class_='review-container')):
item = {
'hotel_name': response.find('h1', class_='heading_title').text,
'review_title': review.find('span', class_='noQuotes').text,
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='relativeDate')['title'],#.text,#[idx],
'num_reviews_reviewer': review.find('span', class_='badgetext').text,
'reviewer_name': review.find('span', class_='scrname').text,
'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
}
results.append(item) # <--- add to global list
#~ yield item
for key,val in item.items():
print(key, ':', val)
print('----')
#return # for test only - to stop after first review
# --- main ---
s = requests.Session()
start_urls = [
'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]
results = [] # <--- global list for items
for url in start_urls:
parse(url, get_soup(url))
import pandas as pd
df = pd.DataFrame(results) # <--- convert list to DataFrame
df.to_csv('output.csv') # <--- save in file

logging in to website using requests

I've tried two completely different methods. But still I can't get the data that is only present after loggin in.
I've tried doing one using requests but the xpath returns a null
import requests
from lxml import html
USERNAME = "xxx"
PASSWORD = "xxx"
LOGIN_URL = "http://www.reginaandrew.com/customer/account/loginPost/referer/aHR0cDovL3d3dy5yZWdpbmFhbmRyZXcuY29tLz9fX19TSUQ9VQ,,/"
URL = "http://www.reginaandrew.com/gold-leaf-glass-top-table"
def main():
FormKeyTxt = ""
session_requests = requests.session()
# Get login csrf token
result = session_requests.get(LOGIN_URL)
tree = html.fromstring(result.text)
# Create payload
formKey = str((tree.xpath("//*[ # id = 'login-form'] / input / # value")))
FormKeyTxt = "".join(formKey)
#print(FormKeyTxt.replace("['","").replace("']",""))
payload = {
"login[username]": USERNAME,
"login[password]": PASSWORD,
"form_key": FormKeyTxt,
"persistent_remember_me": "checked"
}
# Perform login
result = session_requests.post(LOGIN_URL, data=payload)
# Scrape url
result = session_requests.get(URL, data=payload)
tree = html.fromstring(result.content)
bucket_names = tree.xpath("//span[contains(#class, 'in-stock')]/text()")
print(bucket_names)
print(result)
print(result.status_code)
if __name__ == '__main__':
main()
ive tried another one using Mechanical soup but still it returns a null
import argparse
import mechanicalsoup
import urllib.request
from bs4 import BeautifulSoup
parser = argparse.ArgumentParser(description='Login to GitHub.')
parser.add_argument("username")
parser.add_argument("password")
args = parser.parse_args()
browser = mechanicalsoup.Browser()
login_page = browser.get("http://www.reginaandrew.com/gold-leaf-glass-top-table")
login_form = login_page.soup.select("#login-form")[0]
login_form.input({"login[username]": args.username, "login[password]": args.password})
page2 = browser.submit(login_form,login_page.url )
messages = page2.soup.find(class_='in-stock1')
if messages:
print(messages.text)
print(page2.soup.title.text)
I understand the top solution better so id like to do it using that but is there anything I'm missing? (I'm sure I'm missing a lot)
This should do it
import requests
import re
url = "http://www.reginaandrew.com/"
r = requests.session()
rs = r.get(url)
cut = re.search(r'<form.+?id="login-form".+?<\/form>', rs.text, re.S|re.I).group()
action = re.search(r'action="(.+?)"', cut).group(1)
form_key = re.search(r'name="form_key".+?value="(.+?)"', cut).group(1)
payload = {
"login[username]": "fugees",
"login[password]": "nugees",
"form_key": form_key,
"persistent_remember_me": "on"
}
rs = r.post(action, data=payload, headers={'Referer':url})

Categories

Resources