I'm trying to use Requests to get to the "next" (right arrow) page of this URL:
https://www.sportstats.ca/display-results.xhtml?raceid=43572.
I used the Chrome developer tools to examine the request that is sent when I click the arrow manually in a browser, and I've attempted to put together the same form data and make a POST with Requests, but the response I get back still shows Page 1 in the content. Any tips? I've also tried Selenium with mixed results; I'd much rather stick to lightweight Requests if at all possible. Here's my attempt:
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
url = 'https://www.sportstats.ca/display-results.xhtml?raceid=43572'
with requests.Session() as s:
    r1 = s.get(url)
    pagenum = [x for x in r1.text.splitlines() if '<p>Page' in x][0].strip()
    print(pagenum)

    soup = BeautifulSoup(r1.text, 'html.parser')
    hidden_inputs = soup.findAll('input', {'type': 'hidden'})
    prepayload = {x['name']: x['value'] for x in hidden_inputs}

    payload = {}
    payload['javax.faces.partial.ajax'] = 'true'
    payload['javax.faces.source'] = 'mainForm:j_idt386'
    payload['javax.faces.partial.execute'] = 'mainForm'
    payload['javax.faces.partial.render'] = 'mainForm:result_table mainForm:pageNav mainForm:eventAthleteDetailsDialog'
    payload['mainForm:j_idt386'] = 'mainForm:j_idt386'
    payload['mainForm'] = prepayload['mainForm']
    payload['mainForm:raceid'] = prepayload['mainForm:raceid']
    payload['mainForm:status'] = prepayload['mainForm:status']
    payload['mainForm:iframe'] = prepayload['mainForm:iframe']
    payload['mainForm:bib'] = ''
    payload['mainForm:lastname'] = ''
    payload['mainForm:city'] = ''
    payload['mainForm:firstname'] = ''
    payload['mainForm:province'] = ''
    payload['mainForm:categoryFilter'] = 'All Categories'
    payload['javax.faces.ViewState'] = prepayload['javax.faces.ViewState']

    r2 = s.post(url, data=payload)
    pagenum = [x for x in r2.text.splitlines() if '<p>Page' in x][0].strip()
    print(pagenum)
This comes back with:
[myname@myserver] $ ./sstest.py
<p>Page 1 / 19
<p>Page 1 / 19
The website you want to scrape is better suited to Selenium.
All you need is to read the total number of pages when you first visit the site, then loop over that total and click the next button once per iteration.
During every iteration you can do the required parsing for that page as you would normally.
This way you have a script that dynamically parses every page, however many pages the site reports.
Code:
#!/usr/bin/env python
import time
from bs4 import BeautifulSoup
from selenium import webdriver

# Initializations
driver = webdriver.Chrome()
url = 'https://www.sportstats.ca/display-results.xhtml?raceid=43572'
driver.get(url)
driver.maximize_window()
bs = BeautifulSoup(driver.page_source, 'html.parser')

# Retrieve the total number of pages
PagesParser = driver.find_element_by_xpath('//*[@id="mainForm:pageNav"]/div/p')
pages = int(str(PagesParser.text).split('/')[1].replace(' ', ''))
print(pages)

# Loop over every page
for i in range(1, pages+1):
    print('page: ' + str(i))
    # Do your parsing here for every page
    time.sleep(5)
    driver.find_element_by_xpath('//*[@id="mainForm:j_idt386"]').click()  # Clicks the next button
Output:
19
page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
page: 13
page: 14
page: 15
page: 16
page: 17
page: 18
page: 19
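To make the "Do your parsing here" step concrete, here is a sketch of a per-page parsing helper. It assumes the results are rendered inside an element with id mainForm:result_table (taken from the partial.render value in your payload), so verify the id and the cell layout against the real markup:

from bs4 import BeautifulSoup

def parse_current_page(driver):
    # Parse whatever the browser is currently displaying
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Assumed id, taken from the question's payload; check it against the real page
    table = soup.find(id='mainForm:result_table')
    if table is None:
        return []
    rows = []
    for tr in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)
    return rows

Calling parse_current_page(driver) inside the loop, just before the click, would collect each page's rows as lists of strings.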
Related
I'm learning Python for web crawling, but I'm totally stuck.
Each time I run this code the results change; very rarely it works, but it almost always returns an empty list.
Why does this happen? Please let me know.
# main.py
from indeed import extract_indeed_pages, extract_indeed_jobs

last_indeed_page = extract_indeed_pages()
print(last_indeed_page)

indeed_jobs = extract_indeed_jobs(last_indeed_page)
print(indeed_jobs)

# indeed.py
import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://kr.indeed.com/jobs?q=React&l=%EC%84%9C%EC%9A%B8&radius=100&jt=fulltime&limit={LIMIT}"

def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page

def extract_indeed_jobs(last_page):
    jobs = []
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("h2", {"class": "jobTitle"})
    jobs.append(results)
    return jobs
This happens because the listings are rendered by JavaScript, so they are not present in the raw HTML that requests downloads. You can confirm this by viewing the page source in your browser (press Ctrl+U): the jobs you see on screen are missing from it.
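One way around this is to let a real browser execute the JavaScript and then hand the rendered HTML to BeautifulSoup. A minimal sketch with Selenium, assuming chromedriver is installed and reusing the URL and the jobTitle selector from your own code (both may still need adjusting):

import time
from bs4 import BeautifulSoup
from selenium import webdriver

LIMIT = 50
URL = f"https://kr.indeed.com/jobs?q=React&l=%EC%84%9C%EC%9A%B8&radius=100&jt=fulltime&limit={LIMIT}"

driver = webdriver.Chrome()
driver.get(URL)
time.sleep(5)  # crude wait for the JavaScript to finish rendering

# Parse the rendered page instead of the raw HTTP response
soup = BeautifulSoup(driver.page_source, "html.parser")
titles = [h2.get_text(strip=True) for h2 in soup.find_all("h2", {"class": "jobTitle"})]
print(titles)

driver.quit()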
I would like to scrape all of the URLs associated with the soccer games listed in the table on this website.
Here is the code:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
url = 'https://www.coteur.com/cotes-foot.php'
driver.get(url)

fixture1 = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[3]/td[3]/a")
print(fixture1.text)

links = []
i = 3
while i <= 6:
    fixture = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[" + str(i) + "]/td[3]/a")
    links.append(fixture)
    i = i + 3
print(links)

driver.close()
When I scrape one match it returns the data I'm expecting. However, when I try to make a loop to get all the soccer games, I run into a problem.
Here is the result of the code:
Betis Seville - Granada 74 Cf
[<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="158fcdaf-501f-41a4-9550-8a42543acc22")>, <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="74e67896-fccb-48da-8eef-bbf8d9a6f3b3")>]
I wanted to get the same kind of result for each element, but I don't get what I was expecting.
This works well
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("https://www.coteur.com/cotes-foot.php")
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
data = [l.text for l in links]
print(data)
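If you need the actual URLs rather than the link text, the same elements can also give you the href attribute; get_attribute is a standard WebElement method, so a small variation on the above would be:

from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.coteur.com/cotes-foot.php")
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
urls = [l.get_attribute('href') for l in links]
print(urls)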
I tried your code; here is the result:
File "./coteur2.py", line 17
data = [l.text for l in links]
^
IndentationError: unexpected indent
I prefer to do it this way:
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
n = 0
while n < len(links):
    links[n] = links[n].text
    n = n + 1
print(links)
Thanks for your help
I need to scrape all the images from every page of the URL in the code below, but so far I can only do it manually, page by page, up to the last page (the 100th page).
This is the code for scraping a single page; I replace the page number each time and rerun it.
Is there any way to make the page number a variable and run a loop until it hits an error, in this case a 404 page (since no more pages would be left)?
from bs4 import *
import requests as rq

r2 = rq.get("https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page=1&phrase=aishwarya%20rai&sort=mostpopular")
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')  # the frame where it shows the images
for img in x:
    links.append(img['src'])

for index, img_link in enumerate(links):
    img_data = rq.get(img_link).content
    with open("aishwarya_rai/" + str(index + 2) + '.jpg', 'wb+') as f:
        f.write(img_data)  # the with block closes the file automatically
The pages range from 1 to 100.
I need some additional code that makes the page value a variable and loops up to 100.
Use the format() method on the URL string and pass in the page number.
from bs4 import *
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []

for page in range(1, 101):
    print(url.format(page))
    r2 = rq.get(url.format(page))
    soup2 = BeautifulSoup(r2.text, "html.parser")
    x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    for img in x:
        links.append(img['src'])

print(links)
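Since the question also asked about stopping automatically instead of hard-coding 100 pages, here is a sketch of that variant. It assumes the site either returns a 404 status or simply an empty image list once you run past the last page; check which of the two actually happens:

from bs4 import BeautifulSoup
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []
page = 1

while True:
    r2 = rq.get(url.format(page))
    if r2.status_code == 404:
        break  # no more pages
    soup2 = BeautifulSoup(r2.text, "html.parser")
    x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    if not x:
        break  # the page exists but has no images, so treat it as the end
    links.extend(img['src'] for img in x)
    page += 1

print(links)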
I'm new to .NET and Python, but I would like to make a program that scrapes an .aspx site and works with the content there (the HTML code is enough). I tried some Python libraries, but all I get is the first page of the site. It seems like I'm building the wrong POST data; I don't know the right form of the data, what should be included and what shouldn't.
http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018
import requests, urllib, urllib2
r = requests.get("http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018")
content = r.text
print content
start_index = content.find('id="__VIEWSTATE"') + 24
sliced_vs = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEGENERATOR"') + 33
sliced_vsg = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEENCRYPTED"') + 33
sliced_vse = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__EVENTVALIDATION"') + 30
sliced_EV = content[start_index:content.find('"',start_index)]
form_data = {'__EVENTTARGET': 'gvZverejnenie',
'__EVENTARGUMENT': 'Page$2',
'__VIEWSTATE': sliced_vs,
'__VIEWSTATEGENERATOR': sliced_vsg,
'__VIEWSTATEENCRYPTED': sliced_vse,
'__EVENTVALIDATION': sliced_EV}
data_encoded = urllib.urlencode(form_data)
r = requests.post('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018',data=data_encoded)
content = r.text
print content
For example, in the code I try to get the second page ('Page$2'), but I always get the same result back, just with different values for ViewState and EventValidation. Where could the problem be?
This code requires Selenium and chromedriver to control Google Chrome. It turns out there are 476 pages in total (following the URL you provided).
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018')

with open('page_1.html', 'w') as f:
    f.write(driver.page_source)

page_num = 2
while True:
    try:
        element = driver.find_element_by_link_text(str(page_num))
    except NoSuchElementException:
        elements = driver.find_elements_by_link_text('...')
        if len(elements) == 0:
            break  # less than 11 pages total
        elif len(elements) == 1 and page_num > 12:
            break  # last page
        element = elements[-1]
    element.click()
    with open('page_{}.html'.format(page_num), 'w') as f:
        f.write(driver.page_source)
    page_num += 1

driver.quit()
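Once the pages are saved, the HTML can be parsed offline with BeautifulSoup. Here is a sketch of that step; the gvZverejnenie id comes from your __EVENTTARGET value and the row structure is an assumption, so adjust the selectors to the real markup:

import glob
from bs4 import BeautifulSoup

all_rows = []
for path in sorted(glob.glob('page_*.html')):
    with open(path, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # Assumed: the results grid is a table whose id contains 'gvZverejnenie'
    table = soup.find('table', id=lambda v: v and 'gvZverejnenie' in v)
    if table is None:
        continue
    for tr in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            all_rows.append(cells)

print(len(all_rows), 'rows collected')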
I'm scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out whether what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time

tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')

tld = "uz"
has_next = True
page = 0

def create_link(tld, page):
    if page == 0:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    else:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return link

def check_for_next(soup):
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if disabled_nav:
        if "Next" in disabled_nav:
            return False
        else:
            return True
    else:
        return True

def make_soup(link):
    html = jw.get_page(link)
    soup = BeautifulSoup(html, "lxml")
    return soup

def all_the_pages(counter):
    while True:
        link = create_link(tld, counter)
        soup = make_soup(link)
        if check_for_next(soup) == True:
            yield counter
        else:
            break
        counter += 1

def scrape_page(soup):
    table = soup.find('table', {'class': 'rankTable'})
    th = table.find('tbody')
    test = th.find_all("td")
    correct_cells = range(1, len(test), 3)
    for cell in correct_cells:
        #print test[cell]
        url = repr(test[cell])
        content = re.sub("<[^>]*>", "", url)
        sites.writerow([tld] + [content])

def main():
    for page in all_the_pages(0):
        print page
        link = create_link(tld, page)
        print link
        soup = make_soup(link)
        scrape_page(soup)

main()
My thinking behind the code:
The scraper should fetch a page, determine whether another page follows, scrape the current page, and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm going about it here make sense?
As I told you, you could use Selenium to programmatically click the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup

def page_count():
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {'class': 'rankTable'})
        if len(table.find_all('tr')) <= 1:
            return pages
        pages += 1
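A sketch of how this could replace the generator in your script, reusing your own create_link, make_soup, and scrape_page functions (you may need to tweak the page numbering so it matches how the site counts pages):

def main():
    total = page_count()
    for page in range(total):
        link = create_link(tld, page)
        soup = make_soup(link)
        scrape_page(soup)

main()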