Scraping site missing data - python

So I'm trying to scrape the open positions on this site, but whichever kind of request I use (currently trying requests-html), the response doesn't show everything that's in the HTML.
# Asker's reproduction: fetch the career-site landing page and list its <a> tags.
# Import libraries
import time
from bs4 import BeautifulSoup
from requests_html import HTMLSession
# Set the URL you want to webscrape from
url = 'https://germanamerican.csod.com/ux/ats/careersite/5/home?c=germanamerican'
session = HTMLSession()
# Connect to the URL
response = session.get(url)
# NOTE(review): render() is expected to run the page's JavaScript and update
# response.html -- confirm against the requests-html documentation.
response.html.render()
# Parse HTML and save to BeautifulSoup object
# The soup is built from response.text, not from response.html.
# NOTE(review): response.text is presumably the body as downloaded, i.e. the
# markup before any rendering -- confirm.
soup = BeautifulSoup(response.text, "html5lib")
# Collect every anchor element found in that markup.
b = soup.findAll('a')
Not sure where to go. Originally thought the problem was due to javascript rendering but this is not working.

The issue is that the initial GET doesn't return the data (which I assume is the job listings); the JavaScript that does fetch it uses a POST with an authorization token in the header. You need to get this token and then make the POST yourself to get the data.
This token appears to be dynamic so we're going to get a little wonky getting it, but doable.
# Fetch the career-site job listings directly from the JSON search endpoint.
#
# The landing page carries a bearer token inside an inline
# ``csod.context=...`` script. The listings themselves come from a POST to
# the search service that is authorised with that token.
import json

from bs4 import BeautifulSoup as bs
from requests_html import HTMLSession

url0 = r'https://germanamerican.csod.com/ux/ats/careersite/5/home?c=germanamerican'
url = r'https://germanamerican.csod.com/services/x/career-site/v1/search'

s = HTMLSession()
r = s.get(url0)
print(r.status_code)
# No r.html.render() here: the token is read from the raw markup in r.text,
# so launching a headless browser would add nothing.

soup = bs(r.text, 'html.parser')
marker = 'csod.context='
x = next(
    (script for script in soup.find_all('script') if marker in script.text),
    None,
)
if x is None:
    raise RuntimeError('csod.context script not found; cannot obtain the auth token')

# raw_decode() parses exactly one JSON value and ignores whatever follows it
# (the trailing ';'), so semicolons inside string values are left intact.
j, _ = json.JSONDecoder().raw_decode(x.text.split(marker, 1)[1].lstrip())

payload = {
    'careerSiteId': 5,
    'cities': [],
    'countryCodes': [],
    'cultureId': 1,
    'cultureName': "en-US",
    'customFieldCheckboxKeys': [],
    'customFieldDropdowns': [],
    'customFieldRadios': [],
    'pageNumber': 1,
    'pageSize': 25,
    'placeID': "",
    'postingsWithinDays': None,
    'radius': None,
    'searchText': "",
    'states': [],
}
# 'content-length' is deliberately not set: requests derives it from the
# serialised JSON body, and a hard-coded value goes stale as soon as the
# payload changes. 'accept-encoding' is also left to requests so that only
# encodings it can actually decode are advertised.
headers = {
    'accept': 'application/json; q=1.0, text/*; q=0.8, */*; q=0.1',
    'accept-language': 'en-US,en;q=0.9',
    'authorization': 'Bearer ' + j['token'],
    'cache-control': 'no-cache',
    'content-type': 'application/json',
    'csod-accept-language': 'en-US',
    'origin': 'https://germanamerican.csod.com',
    'referer': 'https://germanamerican.csod.com/ux/ats/careersite/5/home?c=germanamerican',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

r = s.post(url, headers=headers, json=payload)
print(r.status_code)
print(r.json())
The r.json() that's printed out is a nicely structured JSON version of the table of job listings.

I don't think it's possible to scrape that website with Requests.
I would suggest using Selenium or Scrapy.

Welcome to SO!
Unfortunately, you won't be able to scrape that page with requests (nor requests_html or similar libraries) because you need a tool to handle dynamic pages - i.e., javascript-based.
With python, I would strongly suggest selenium and its webdriver. Below a piece of code that prints the desired output - i.e., all listed jobs (NB it requires selenium and Firefox webdriver to be installed and with the correct PATH to run)
# Print the title of every job listed on the career site, using a real
# browser so that the JavaScript-generated listings are present in the DOM.
# Import libraries
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set the URL you want to webscrape from
url = 'https://germanamerican.csod.com/ux/ats/careersite/5/home?c=germanamerican'

browser = webdriver.Firefox()  # initialize the webdriver. I use FF, might be Chromium or else
try:
    browser.get(url)  # go to the desired page
    # The listings are injected after the initial load, so wait until at
    # least one job link exists instead of reading the source immediately.
    try:
        WebDriverWait(browser, 15).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'a[data-tag="displayJobTitle"]')
            )
        )
    except TimeoutException:
        # Nothing showed up in time; continue with whatever is on the page.
        pass
    page = browser.page_source  # page source, now including the loaded listings
finally:
    # Always close the browser, even if navigation fails.
    browser.quit()

soup = BeautifulSoup(page, "lxml")
jobs = soup.findAll('a', {'data-tag': 'displayJobTitle'})
for j in jobs:
    print(j.text)

Related

login with requests and BeautifulSoup to scrape pages

I need to scrape a page that requires login to access.
I tried to login with the saved logins info converted in cUrl, using requests and BeautifulSoup but it doesn't work.
I need to login on 'https://www.seoprofiler.com/account/login'
And then scrape pages like: 'https://www.seoprofiler.com/lp/links?q=test.com'
Here's my code:
# Asker's reproduction: POST the login form, then GET a protected page and
# print its <title>.
from bs4 import BeautifulSoup
import requests
# Hand-written cookie values (placeholders in the post).
cookies = {
'csrftoken': 'token123',
'seoprofilersession': 'session123',
}
# Request headers; presumably copied from a browser's "copy as cURL" export,
# as the question text describes.
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'sec-ch-ua': '^\\^',
'sec-ch-ua-mobile': '?0',
'Upgrade-Insecure-Requests': '1',
'Origin': 'https://www.seoprofiler.com',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': 'https://www.seoprofiler.com/account/login',
'Accept-Language': 'en,en-US;q=0.9,it;q=0.8',
}
# Login form fields; the CSRF token here is the same hard-coded placeholder
# as in the cookies dict above.
data = {
'csrfmiddlewaretoken': 'token123',
'username': 'email123#gmail.com',
'password': 'pass123!',
'button': ''
}
# Submit the login form. The result is overwritten by the next request
# without being inspected.
response = requests.post('https://www.seoprofiler.com/account/login',
headers=headers, cookies=cookies, data=data)
url = 'https://www.seoprofiler.com/lp/links?q=test.com'
# This GET sends the same hand-written cookies dict; nothing returned by the
# login POST above is passed along.
response = requests.get(url, headers= headers, cookies=cookies)
soup = BeautifulSoup(response.content, 'html.parser')
# The return value of encode() is discarded, so this line has no effect on
# what is printed below.
soup.encode('utf-8')
print(soup.title)
I would not use selenium as I have to scrape a lot of data and it would require a lot of time with selenium.
How can I login in order to scrape pages logged in?
Thank you!
You could use requests.Session!
After some trial and error I was able to log in and get the project page using the following script:
# Log in through a single requests.Session and fetch a project page with it.
import requests

LOGIN_URL = "https://www.seoprofiler.com/account/login"
PROJECT_URL = "https://www.seoprofiler.com/project/google.com-fa1b9c855721f3d5"

# The session object keeps cookies from one request to the next.
session = requests.Session()

# Load the login page first so that the seoprofilersession and csrftoken
# cookies get set on the session.
session.get(LOGIN_URL)

# Send the CSRF cookie back as a form field together with the credentials.
login_form = {
    "csrfmiddlewaretoken": session.cookies.get_dict()["csrftoken"],
    "username": "your_email",
    "password": "your_password",
}
session.post(LOGIN_URL, data=login_form)  # login, sets needed cookies

# From here on, use this session for every page you need.
resp = session.get(PROJECT_URL)  # get main page content
print(resp.status_code)  # my output: 200
Edited:
Just checked one more thing, and it appears that it is not mandatory to retrieve the seoprofilersession and csrftoken cookies: you can simply call the login POST with your credentials (without csrfmiddlewaretoken) and then use your session.
How do you know what data structure you must pass to your login page?
A more confident solution uses selenium to fill the username and password fields of the login page and then click on the login button. Next, go to the desired page and scrape that.

How to do reverse image search on google by uploading image url?

My goal is to automate google reverse image search.
I would like to upload an image url and get all the website links that include the matching image.
So here is what I could produce so far:
# Asker's reproduction: request Google's search-by-image URL for a picture
# and dump the returned markup.
import requests
import bs4
# Let's take a picture of Chicago
chicago = 'https://images.squarespace-cdn.com/content/v1/556e10f5e4b02ae09b8ce47d/1531155504475-KYOOS7EEGVDGMMUQQNX3/ke17ZwdGBToddI8pDm48kCf3-plT4th5YDY7kKLGSZN7gQa3H78H3Y0txjaiv_0fDoOvxcdMmMKkDsyUqMSsMWxHk725yiiHCCLfrh8O1z4YTzHvnKhyp6Da-NYroOW3ZGjoBKy3azqku80C789l0h8vX1l9k24HMAg-S2AFienIXE1YmmWqgE2PN2vVFAwNPldIHIfeNh3oAGoMooVv2g/Chi+edit-24.jpg'
# And let's take google image search uploader by url
googleimage = 'https://www.google.com/searchbyimage?&image_url='
# Here is our Chicago image url uploaded into google image search
# The image URL is appended as-is; no percent-encoding is applied to it.
url = googleimage+chicago
# And now let's request our Chicago google image search
# No headers are passed, so the request goes out with the library defaults.
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text,'html.parser')
# Here is the output
print(soup.prettify())
My problem is that I did not expect this print(soup.prettify()) output.
I am not including the output in the post because it's too long.
If you type in your browser:
https://www.google.com/searchbyimage?&image_url=https://images.squarespace-cdn.com/content/v1/556e10f5e4b02ae09b8ce47d/1531155504475-KYOOS7EEGVDGMMUQQNX3/ke17ZwdGBToddI8pDm48kCf3-plT4th5YDY7kKLGSZN7gQa3H78H3Y0txjaiv_0fDoOvxcdMmMKkDsyUqMSsMWxHk725yiiHCCLfrh8O1z4YTzHvnKhyp6Da-NYroOW3ZGjoBKy3azqku80C789l0h8vX1l9k24HMAg-S2AFienIXE1YmmWqgE2PN2vVFAwNPldIHIfeNh3oAGoMooVv2g/Chi+edit-24.jpg
You will see that the html code is way different from our output with soup.
I was expecting the soup code to have the final results so I can parse the links I need. Instead I only got some weird functions that I don't really understand.
It seems that google image search is a three step process: first you upload your image, then something happens with weird functions, then you get your final results.
How can I get my final results just like in my browser? So I can parse the html code like usual.
Let me explain for you.
use print(response.history)
And print(response.url)
So if it's 200, then you will get a url such as https://www.google.com/search?tbs=sbi:
But if it's 302, then you will get a URL such as https://www.google.com/webhp?tbs=sbi:
A 302 means that Google detected you as a bot and therefore denied the request by redirecting it to webhp (= Web Hidden Path), where it is held for robot detection and further analysis on Google's side.
You can confirm that if you pressed on your link Click Here and check what's will appear on the browser bar.
Which means that you need to consider header part in order to be on right track.
Use the following way.
# Request Google's search-by-image page with browser-like headers and print
# the returned markup.
from bs4 import BeautifulSoup
import requests

# Headers mimicking a Firefox request, as supplied by the answer's author.
headers = {
    'Host': 'www.google.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.google.com/',
    'Origin': 'https://www.google.com',
    'Connection': 'keep-alive',
    'Content-Length': '0',
    'TE': 'Trailers'
}
r = requests.get("https://www.google.com/searchbyimage?image_url=https://images.squarespace-cdn.com/content/v1/556e10f5e4b02ae09b8ce47d/1531155504475-KYOOS7EEGVDGMMUQQNX3/ke17ZwdGBToddI8pDm48kCf3-plT4th5YDY7kKLGSZN7gQa3H78H3Y0txjaiv_0fDoOvxcdMmMKkDsyUqMSsMWxHk725yiiHCCLfrh8O1z4YTzHvnKhyp6Da-NYroOW3ZGjoBKy3azqku80C789l0h8vX1l9k24HMAg-S2AFienIXE1YmmWqgE2PN2vVFAwNPldIHIfeNh3oAGoMooVv2g/Chi+edit-24.jpg&encoded_image=&image_content=&filename=&hl=en", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
# prettify must be called; printing the bare attribute would only show the
# bound-method repr instead of the page markup.
print(soup.prettify())

Scrape a dynamic(AJAX) website using selenium in Python

I have an AJAX-based website, https://stackshare.io/application_and_data. I am trying to scrape the logos of tech stacks across all the pages. I used Selenium's find_elements_by_class_name, but it returns an empty list. The jQuery found in the XHR request does not have a URL which I can use. I need help reverse-engineering the jQuery script.
The other URLs I found in the Network data also seem to fail. I tried postman to replicate the request,but could not do it correctly.
Any help is very much appreciated.
# Asker's reproduction: open the StackShare page in Firefox and look up two
# elements by class name.
import time
import requests
from bs4 import BeautifulSoup
import urlparse
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Firefox(executable_path="/home/Documents/geckodriver")
driver.get("https://stackshare.io/application_and_data/")
# Both lookups pass several space-separated class names in one string.
# NOTE(review): class-name locators presumably accept a single class only,
# which would explain the empty results -- confirm against the Selenium docs.
content = driver.find_elements_by_class_name("btn btn-ss-alt btn-lg load-more-layer-stacks")
content_1 = driver.find_elements_by_class_name("div-center hidden-xs")
content and content_1 both give an empty list. How do I proceed, or what am I doing wrong here?
Following is the reverse engineering approach I tried.
# Asker's attempt to replay the page's "load more" XHR with requests
# (Python 2 syntax: note the print statement on the last line).
request_url = 'https://stackshare.io/application_and_data/load-more'
# Headers written out by hand, including a session cookie and a CSRF token.
# The token value contains an ellipsis character, so it appears truncated.
request_headers = {
'Accept' : '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language' : 'en-GB,en;q=0.5',
'Connection' : 'keep-alive',
'Content-Length' : '128',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'cookie' :'_stackshare_production_session=cUNIOVlrV0h2dStCandILzJDWmVReGRlaWI1SjJHOWpYdDlEK3BzY2JEWjF3Lzd6Z0F6Zmg1RjUzNGo0U1dPNFg2WHdueDl5VEhCSHVtS2JiaVdNN0FvRWJMV0pBS0ZaZ0RWYW14bFFBcm1OaDV6RUptZlJMZ29TQlNOK1pKOFZ3NTVLbEdmdjFhQnRLZDl1d29rSHVnPT0tLWFzQlcrcy9iQndBNW15c0lHVHlJNkE9PQ%3D%3D--b0c41a10e8b0cf8cd020f7b07d6507894e50a9c5; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%224cf45ffc-a1ab-4048-94ba-d8c58063df95%22; wooTracker=Psbca0UX84Do; _ga=GA1.2.877065752.1528363377; amplitude_id_63407ddf709a227ea844317f20f7b56estackshare.io=eyJkZXZpY2VJZCI6IjcwYmNiMGQ3LTM1MjAtNDgzZi1iNWNlLTdmMTIzYzQxZGEyMVIiLCJ1c2VySWQiOm51bGwsIm9wdE91dCI6ZmFsc2UsInNlc3Npb25JZCI6MTUyODgwNTg2ODQ0NiwibGFzdEV2ZW50VGltZSI6MTUyODgwNjc0Nzk2OSwiZXZlbnRJZCI6ODUsImlkZW50aWZ5SWQiOjUsInNlcXVlbmNlTnVtYmVyIjo5MH0=; uvts=7an3MMNHYn0XBZYF; __atuvc=3%7C23; _gid=GA1.2.685188865.1528724539; amplitude_idundefinedstackshare.io=eyJvcHRPdXQiOmZhbHNlLCJzZXNzaW9uSWQiOm51bGwsImxhc3RFdmVudFRpbWUiOm51bGwsImV2ZW50SWQiOjAsImlkZW50aWZ5SWQiOjAsInNlcXVlbmNlTnVtYmVyIjowfQ==; _gat=1; _gali=wrap',
'Host' :'stackshare.io',
'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Referer' :'https://stackshare.io/application_and_data',
'X-CSRF-Token' : 'OEhhwcDju+WcpweukjB09hDFPDhwqX…nm+4fAgbMceRxnCz7gg4g//jDEg==',
'X-Requested-With' : 'XMLHttpRequest'
}
# The form payload is empty, although the headers above declare a
# Content-Length of 128.
payload = {}
response = requests.post(request_url, data=payload, headers=request_headers)
print response
Observation: I got a 499 Response code. What payload do I need to give?
I checked the XHR request, but could not find the correct URL it leads to.

Put data into a list from webpage (splinter)

I am doing a little bot, that should give information from website (ebay) and put into a list using splinter and python. My first lines of code:
# Open eBay in a splinter-driven browser and run a search for "levis".
from splinter import Browser

# The context manager closes the browser when the block ends, so all page
# interaction has to happen inside it.
with Browser() as browser:
    url = "http://www.ebay.com"
    browser.visit(url)
    # '_nkw' is the name of the search input; 'gh-btn' is the id of the
    # search button.
    browser.fill('_nkw', 'levis')
    button = browser.find_by_id('gh-btn')
    button.click()
How I can put information that in red frame to list, using information from web page?
Like : [["Levi Strauss & Co. 513 Slim Straight Jean Ivory Men's SZ", 12.99, 0], ["Levi 501 Jeans Mens Original Levi's Strauss Denim Straight", 71.44, "Now"], ["Levis 501 Button Fly Jeans Shrink To Fit Many Sizes", [$29.99, $39.99]]]
This is not perfect answer, but it should work.
first thing install these two module
requests and BS4:
pip install requests
pip install beautifulsoup4
# Search eBay for a term and print a list of (title, price) tuples scraped
# from the result page.
import requests
import json
from urllib.parse import quote_plus

from bs4 import BeautifulSoup

# setting up the headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://www.ebay.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.8',
    'Host': 'www.ebay.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

# setting up my proxy, you can disable it
# NOTE(review): this points at a local proxy, and verify=False below turns
# off TLS certificate checks -- presumably only for that proxy. Drop both
# when running without it.
proxy = {
    'https': '127.0.0.1:8888'
}

# search terms
search_term = 'armani'
# Percent-encode the term so spaces and reserved characters cannot break
# the query string.
encoded_term = quote_plus(search_term)

# request session begins
ses = requests.session()

# first get home page so to set cookies
resp = ses.get('https://www.ebay.com/', headers=headers, proxies=proxy, verify=False)

# next get the search term page to parse request
resp = ses.get(
    'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2374313.m570.l1313.TR12.TRC2.A0.H0.X'
    + encoded_term + '.TRS0&_nkw=' + encoded_term + '&_sacat=0',
    headers=headers, proxies=proxy, verify=False)

soup = BeautifulSoup(resp.text, 'html.parser')
items = soup.find_all('a', {"class": "vip"})
price_items = soup.find_all('span', {"class": "amt"})

# Titles and prices are collected separately and paired by position.
final_list = []
for item, price in zip(items, price_items):
    price_tag = price.find('span', {"class": "bold"})
    if price_tag is None:
        # No bold price element inside this entry: skip it explicitly
        # rather than hiding the failure behind a blanket except.
        continue
    final_list.append((item.getText(), price_tag.getText()))

print(final_list)
This is the output that I got
I agree with @Aki003. Something like this:
def get_links(ebay_url):
    """Return the href of every link on the page at *ebay_url*.

    The page is downloaded with requests and parsed with BeautifulSoup.
    Anchors without an href attribute are left out, so every entry in the
    returned list is a string.
    """
    page = requests.get(ebay_url).text
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]
You can scrape for any other element on the webpage. Check the beautifulsoup documentation.

google search with python requests library

(I've tried looking but all of the other answers seem to be using urllib2)
I've just started trying to use requests, but I'm still not very clear on how to send or request something additional from the page. For example, I'll have
import requests
# Plain GET of the Google home page; no query parameters are sent.
r = requests.get('http://google.com')
but I have no idea how to now, for example, do a google search using the search bar presented. I've read the quickstart guide but I'm not very familiar with HTML POST and the like, so it hasn't been very helpful.
Is there a clean and elegant way to do what I am asking?
Request Overview
The Google search request is a standard HTTP GET command. It includes a collection of parameters relevant to your queries. These parameters are included in the request URL as name=value pairs separated by ampersand (&) characters. Parameters include data like the search query and a unique CSE ID (cx) that identifies the CSE that is making the HTTP request. The WebSearch or Image Search service returns XML results in response to your HTTP requests.
First, you must get your CSE ID (cx parameter) at Control Panel of Custom Search Engine
Then, See the official Google Developers site for Custom Search.
There are many examples like this:
http://www.google.com/search?
start=0
&num=10
&q=red+sox
&cr=countryCA
&lr=lang_fr
&client=google-csbe
&output=xml_no_dtd
&cx=00255077836266642015:u-scht7a-8i
And there are explained the list of parameters that you can use.
import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every search request.
headers_Get = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}


def google(q):
    """Run a Google web search for *q* and return the parsed results.

    Returns a list of ``{'text': ..., 'url': ...}`` dicts, one per result
    heading found in the response. Headings without a link are skipped.
    """
    # Collapse runs of whitespace, then let requests build the query string
    # so characters such as '&', '#' or '+' in the query are encoded.
    params = {'q': ' '.join(q.split()), 'ie': 'utf-8', 'oe': 'utf-8'}
    with requests.Session() as s:
        r = s.get('https://www.google.com/search', params=params, headers=headers_Get)
    soup = BeautifulSoup(r.text, "html.parser")
    output = []
    # this selector may change in future based on google's web page structure
    for searchWrapper in soup.find_all('h3', {'class': 'r'}):
        link = searchWrapper.find('a')
        if link is None or not link.get('href'):
            continue
        output.append({'text': link.text.strip(), 'url': link['href']})
    return output
Will return an array of google results in {'text': text, 'url': url} format. Top result url would be google('search query')[0]['url']
input:
import requests


def googleSearch(query):
    """Request google.co.in with *query* as the ``q`` parameter and print
    the final URL of the response."""
    with requests.session() as c:
        url = 'https://www.google.co.in'
        # Kept separate from the `query` argument so the parameter is not
        # rebound to a different type.
        params = {'q': query}
        # Go through the session opened above rather than a separate
        # module-level requests.get call.
        urllink = c.get(url, params=params)
        print(urllink.url)


googleSearch('Linkin Park')
output:
https://www.google.co.in/?q=Linkin+Park
The readable way to send a request with many query parameters would be to pass URL parameters as a dictionary:
params = {
'q': 'minecraft', # search query
'gl': 'us', # country where to search from
'hl': 'en', # language
}
requests.get('URL', params=params)
But, in order to get the actual response (output/text/data) that you see in the browser you need to send additional headers, more specifically user-agent which is needed to act as a "real" user visit when bot or browser sends a fake user-agent string to announce themselves as a different client.
The reason that your request might be blocked is that the default requests user agent is python-requests and websites understand that. Check what's your user agent.
You can read more about it in the blog post I wrote about how to reduce the chance of being blocked while web scraping.
Pass user-agent:
headers = {
'User-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
requests.get('URL', headers=headers)
Code and example in the online IDE:
# Fetch a Google results page with a browser user-agent and print the title
# and link of each result block.
from bs4 import BeautifulSoup
import requests
import lxml  # NOTE(review): presumably imported only so a missing 'lxml' parser fails early -- confirm

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}

params = {
    'q': 'minecraft',  # search query
    'gl': 'us',        # country where to search from
    'hl': 'en',        # language
}

html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

for result in soup.select('.tF2Cxc'):
    title_tag = result.select_one('.DKV0Md')
    link_tag = result.select_one('.yuRUbf a')
    # select_one returns None when nothing matches; skip result blocks that
    # lack either piece instead of failing on the attribute access.
    if title_tag is None or link_tag is None:
        continue
    print(title_tag.text, link_tag['href'], sep='\n')
Alternatively, you can achieve the same thing by using Google Organic API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to create it from scratch and maintain it.
Code to integrate:
# Query Google through the SerpApi client and print the title and link of
# each organic result.
import os
from serpapi import GoogleSearch

params = {
    "engine": "google",
    "q": "tesla",
    "hl": "en",
    "gl": "us",
    # The API key is read from the API_KEY environment variable.
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results["organic_results"]:
    print(result['title'])
    print(result['link'])
Disclaimer, I work for SerpApi.
In this code by using bs4 you can get all the h3 and print their text
# Import the beautifulsoup
# and request libraries of python.
# Search Google for a phrase and print the text of every <h3> element in
# the response.
import requests
import bs4

# The search keyword. It is passed through `params` so that requests
# percent-encodes it; concatenated raw, the '+' characters in "c++" would
# be read by the server as spaces.
text = "c++ linear search program"
url = 'https://google.com/search'

# Fetch the URL data using requests.get,
# store it in a variable, request_result.
request_result = requests.get(url, params={'q': text})

# Creating soup from the fetched request
soup = bs4.BeautifulSoup(request_result.text, "html.parser")

# Named `headings` rather than `filter` so the builtin is not shadowed.
headings = soup.find_all("h3")
for heading in headings:
    print(heading.get_text())
You can use 'webbrowser'; I think it doesn't get easier than that:
# Ask for a query on stdin and open the Google results for it in the
# default browser.
import webbrowser
from urllib.parse import quote_plus

query = input('Enter your query: ')
# Encode the query so spaces and characters like '&', '#' or '+' stay part
# of the search text instead of altering the URL.
webbrowser.open(f'https://google.com/search?q={quote_plus(query)}')

Categories

Resources