Scraping data through a paginated table using Python

I am scraping data from Google Finance's historical page for a stock (http://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=PLfUVIDTDuSRiQKhwYGQBQ).
I can scrape the 30 rows on the current page. The issue I am facing is that I am unable to scrape the rest of the data in the table (rows 31-241). How do I go to the next page or link?
Following is my code:
import urllib2
import xlwt  # to write into excel spreadsheet
from bs4 import BeautifulSoup

# Main Coding Section
stock_links = open('stock_link_list.txt', 'r')  # opening text file for reading
#url="https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=zHXOVLPnApG2iALxxYCADQ"

for url in stock_links:
    OurFile = urllib2.urlopen(url)
    OurHtml = OurFile.read()
    OurFile.close()
    soup = BeautifulSoup(OurHtml)

    #soup1 = soup.find("div", {"class": "gf-table-wrapper sfe-break-bottom-16"}).get_text()
    soup1 = soup.find("table", {"class": "gf-table historical_price"}).get_text()

    end = url.index('&')
    filename = url[47:end]
    file = open(filename, 'w')  # opening text file for writing
    file.write(soup1)
    #file.write(soup1.get_text())  # writing to the text file
    file.close()  # closing the text file

You will have to fine-tune it, and I would catch more specific errors, but you can keep increasing start to get the next batch of data:
url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=W8LUVLHnAoOswAOFs4DACg&start={}&num=30"
from bs4 import BeautifulSoup
import requests
# Main Coding Sectio
start = 0
while True:
try:
nxt = url.format(start)
r = requests.get(nxt)
soup = BeautifulSoup(r.content)
print(soup.find("table",{"class": "gf-table historical_price"}).get_text())
except Exception as e:
print(e)
break
start += 30
This gets all the table data up to the last date, Feb 7, 2014:
......
Date
Open
High
Low
Close
Volume
Feb 7, 2014
552.60
557.90
548.25
551.50
119,711

At first sight, the Row Limit option allows showing a maximum of 30 rows per page, but I manually changed the query string parameters to larger numbers and realized you can view a maximum of 200 rows per page.
Change the URL to
https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=OM3UVLFtkLnzBsjIgYAI&start=0&num=200
and it will show 200 rows.
Then change it to start=200&num=400, and so on.
More generally, if you have many other such links, you can scrape the pagination area (the last TR of the table), grab the links to the next pages, and scrape those; a sketch of that idea follows.
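Here is a minimal sketch of that pagination-following idea. The assumption that the pager anchors sit in the last <tr> of the historical-price table comes from the description above and is not a verified selector, so adjust it to whatever the page actually renders.

# Hedged sketch: follow the pager links found in the table's last row.
# Assumption: the pager anchors live in the last <tr> of the
# "gf-table historical_price" table, as described above.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def pager_links(page_url):
    r = requests.get(page_url)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table", {"class": "gf-table historical_price"})
    if table is None:
        return []
    last_row = table.find_all("tr")[-1]  # assumed pager row
    return [urljoin(page_url, a["href"]) for a in last_row.find_all("a", href=True)]

# Walk the pager links breadth-first, starting from the first page.
seen = set()
queue = ["https://www.google.com/finance/historical?q=NSE%3ASIEMENS&start=0&num=30"]
while queue:
    page_url = queue.pop(0)
    if page_url in seen:
        continue
    seen.add(page_url)
    for link in pager_links(page_url):
        if link not in seen:
            queue.append(link)
print(len(seen), "pages visited")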

Related

How to fix this web scraping program?

I'm a complete beginner with Python, and I wrote this program to scrape the closing odds of NHL games off The Score website and put that data in a file. The program runs, but for some reason only 2 of the roughly 200 games I tried show up with incorrect data.
I think it is because of how I search through divs within a div; I wrote the code that returns that data in a way that only stores the last div (which conveniently is the div I'm looking to scrape).
Also, I'm sure my way of writing to the file is poor for runtime; is there a better way to do this?
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(response.content, "html.parser")

    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})

    # Code to find div and search for div within
    divs = soup.find('div', {'class': 'col-sm-4'})
    for tag in divs:
        # find div
        target = tag.find_all("div", {"class","GameDetailsCard__row--3rKYp"})
        for tag in target:
            # find divs within target div
            odds = tag.find("div", {"class","GameDetailsCard__content--2L_KF"})
    # Call write_to_file -> add data scraped from web
    write_to_file(matchtitle.text + " " + odds.text)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    file = open("NHL_GAMES.txt", "a")
    file.write(game_data + "\n")
    file.close

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
Here is one line in the text file with correct data:
Toronto Maple Leafs @ New Jersey Devils on November 24, 2022 NJD -140, o/u 6.5
Here is one with incorrect data
Carolina Hurricanes @ Anaheim Ducks on December 7, 2022 Justin St. Pierre, Chris Lee
Only 2/215 were wrong like this.
It looks like certain NHL game webpages (e.g. the Carolina one) do not contain a <div> section for the 'Odds'; this might be due to them being OT games? Regardless, your best bet is to add a clause to handle 'no odds found'. I have updated some of your code below:
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    results = []
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})

    target = soup.find_all("div", {"class","GameDetailsCard__row--3rKYp"})
    for tag in target:
        if "Odds" in str(tag.find("div", {"class": "GameDetailsCard__label--iBMhJ"})):
            odds = str(tag.find("div", {"class": "GameDetailsCard__content--2L_KF"}).text)
        else:
            odds = "No Odds found!"

    print(matchtitle.text + " " + odds)
    results.append(matchtitle.text + " " + odds)
    # Call write_to_file -> add data scraped from web
    write_to_file(results)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    print("Getting game details...")
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    with open("NHL_GAMES.txt", "a") as file:
        for line in game_data:
            file.write(line + "\n")

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
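On the second part of your question: write_to_file now uses a with block, so the file is closed automatically after each write; in the original version, file.close without parentheses never actually called close().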

Stuck Scraping with Beautifulsoup

So I'm trying to scrape an HTML webpage. It has novel chapters, and I'm trying to get the text and store it in text files to read offline. I don't have any previous experience with HTML or other things either. The webpage I am trying to scrape is this, and the code I've been testing so far looks like this:
import sys
import requests
import time
import re
from bs4 import BeautifulSoup

def browse_and_scrape(seed_url, page_number=1):
    # Fetch the URL - We will be using this to append to images and info routes
    url_pat = re.compile(r"(http://.*\.org)")
    source_url = url_pat.search(seed_url).group(0)
    # Page_number from the argument gets formatted in the URL & Fetched
    formatted_url = seed_url.format(str(page_number))
    # print(url_pat, source_url, formatted_url)
    try:
        html_text = requests.get(formatted_url).text
        # print(html_text)
        # Prepare the soup
        soup = BeautifulSoup(html_text, "html.parser")
        print(soup.find_all(id="chapterContent")[0]["style"])
        print(f"Now Scraping - {formatted_url}")
        # help = soup.find_all("div", class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
        # for node in soup.findAll("div", class_="chapter-content text-normal"):
        #     print(node)
        #     print(''.join(node.findAll(text=True)))
        # for node in soup.findAll("div"):
        #     # print(node)
        #     print(''.join(node.findAll(text=True)))
        # help = soup.find_all("div", class_="chapter-content text-normal")[0]
        # print(''.join(help.findAll(text=True)))
        # print(help)
    except Exception as e:
        return e
    return True

if __name__ == "__main__":
    # seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
    seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
    # seed_url = "http://wnmtl.org/chapter/{}.html"
    print("Web scraping has begun")
    result = browse_and_scrape(seed_url)
    if result == True:
        print("Web scraping is now complete!")
    else:
        print(f"Oops, That doesn't seem right!!! - {result}")
All the commented-out stuff is what I've been trying in order to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with the id chapterContent. My plan is to iteratively get the text, store it, get the link for the next page, and repeat, but I've been stuck for a bit now. Any suggestions?
Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last part of the above URL is the chapter ID (324909). You can navigate to other chapters by passing in their chapter IDs.
The next and previous chapter IDs are present in the current chapter's API response. Have a look at the above URL in a browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests

def get_data(chapter_id, pages):
    if pages == 0:
        return
    url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
    r = requests.get(url)
    x = r.json()
    pre_id = x['data']['preId']
    next_id = x['data']['nextId']
    title = x['data']['title']
    content = x['data']['content']
    chapter_title = f'\n***** Chapter: {title} *****\n'
    with open('novel.txt', 'a') as f:
        f.write(chapter_title)
        f.write(content + '\n')
    print(f"Chapter: '{title}' written to file.")
    get_data(next_id, pages - 1)

curr_id = '324909'
get_data(curr_id, 3)
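Output: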
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.

Getting data from a certain website via Python

I'm trying to get the Chinese yuan's exchange rate to USD from the Bank of China website; however, I'm lost in trying to get the data out of the page. Here is my code:
from lxml import etree
import requests
s = "https://www.boc.cn/sourcedb/whpj/enindex_1619.html"
page = requests.get(s)
tree = etree.HTML(page.text)
element = tree.xpath('./body/table[1]/tbody/tr/td[1]/table[1]/tbody/tr/td/table/tbody/tr[26]/td')
content = etree.tostring(element[0])
The element variable is the USD row on that page, found after I inspected the page.
When I run the program I get this message:
File "d:\Python\tebotCopy.py", line 8, in <module>
    content = etree.tostring(element[0])
IndexError: list index out of range
You can try a different XPath to get the row with USD (note that the tbody elements you see in the browser inspector are typically inserted by the browser and are not present in the raw HTML, which is likely why your XPath matches nothing):
import requests
from lxml import etree

s = "https://www.boc.cn/sourcedb/whpj/enindex_1619.html"
page = requests.get(s)
tree = etree.HTML(page.text)

element = tree.xpath('//table[@bgcolor="#EAEAEA"]/tr[27]/td')
for td in element:
    print(td.text, end=" ")
print()
Prints:
USD 641.51 636.29 644.23 644.23 646.12 2021.10.14 07:23:51 

Scrape all images off of a multi-page site?

I need to scrape all the images on the pages of the URL given in the code, but I could only do it manually, page by page, up to the last (100th) page.
This is the code for scraping each page; I replace the page number each time and run it. It is shown below.
Is there any way to add a page variable and run a loop until it hits an error, in this case a 404 page (since no more pages would be left)?
from bs4 import *
import requests as rq

r2 = rq.get("https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page=1&phrase=aishwarya%20rai&sort=mostpopular")
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')  # the frame where it shows the images
for img in x:
    links.append(img['src'])

for index, img_link in enumerate(links):
    img_data = rq.get(img_link).content
    with open("aishwarya_rai/" + str(index + 2) + '.jpg', 'wb+') as f:
        f.write(img_data)
else:
    f.close()
The pages range from 1 to 100.
I need some additional code that makes the page value a variable and loops up to 100.
Use the format() function and pass in the page variable:
from bs4 import *
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []

for page in range(1, 101):
    print(url.format(page))
    r2 = rq.get(url.format(page))
    soup2 = BeautifulSoup(r2.text, "html.parser")
    x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    for img in x:
        links.append(img['src'])

print(links)
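Since the question also asked about stopping on an error rather than hard-coding the range, here is a minimal sketch of that variant. It assumes the site returns a 404 status for out-of-range pages; some sites instead return an empty results page, so a fallback check for "no images found" is included.

# Hedged sketch: loop until the site signals there are no more pages.
from bs4 import BeautifulSoup
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []
page = 1
while True:
    r2 = rq.get(url.format(page))
    if r2.status_code == 404:   # assumption: out-of-range pages return 404
        break
    soup2 = BeautifulSoup(r2.text, "html.parser")
    imgs = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    if not imgs:                # fallback: stop when a page has no result images
        break
    links.extend(img['src'] for img in imgs)
    page += 1

print(len(links), "image links collected")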

Scraping multiple pages with Python Beautifulsoup -- only returning data from last page

I am trying to loop through multiple pages to scrape data with Python and BeautifulSoup. My script works for one page, but when I try to iterate through multiple pages, it only returns the data from the last page scraped. I think there may be something wrong in the way I am looping or storing/appending the player_data list.
Here is what I have thus far -- any help is much appreciated.
#! python3
# downloadRecruits.py - Downloads espn college basketball recruiting database info

import requests, os, bs4, csv
import pandas as pd

# Starting url (class of 2007)
base_url = 'http://www.espn.com/college-sports/basketball/recruiting/databaseresults/_/class/2007/page/'

# Number of pages to scrape (Not inclusive, so number + 1)
pages = map(str, range(1, 3))

# url for starting page
url = base_url + pages[0]

for n in pages:
    # Create url
    url = base_url + n

    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    # Creating bs object
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    table = soup.find('table')

    # Get the data
    data_rows = soup.findAll('tr')[1:]

    player_data = []
    for tr in data_rows:
        tdata = []
        for td in tr:
            tdata.append(td.getText())
            if td.div and td.div['class'][0] == 'school-logo':
                tdata.append(td.div.a['href'])
        player_data.append(tdata)

print(player_data)
You should have your player_data list definition outside your loop; otherwise only the last iteration's results will be stored.
This is an indentation issue or a declaration issue, depending on the results you expect.
If you need to print the result for each page:
You can solve this by adding 4 spaces before print(player_data).
If you leave the print statement outside the for loop block, it will be executed only once, after the loop has ended, so the only values it can display are the last values of player_data leaking from the last iteration of the for loop.
If you want to store all results in player_data and print them at the end:
You must declare player_data outside and before your for loop.
player_data = []
for n in pages:
    # [...]
import requests
from bs4 import BeautifulSoup

# Starting url (class of 2007)
base_url = 'http://www.espn.com/college-sports/basketball/recruiting/databaseresults/_/class/2007/page/'

# Number of pages to scrape (Not inclusive, so number + 1)
# In Python 3, map returns an iterable map object, not a subscriptable list
# (which would allow you to write pages[i]). To force a list result, wrap it in list().
pages = list(map(str, range(1, 3)))

# url for starting page
url = base_url + pages[0]

# Declare player_data before the loop so results from every page are kept
player_data = []

for n in pages:
    # Create url
    url = base_url + n

    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    # Creating bs object
    soup = BeautifulSoup(res.text, "html.parser")

    table = soup.find('table')

    # Get the data
    data_rows = soup.findAll('tr')[1:]

    for tr in data_rows:
        tdata = []
        for td in tr:
            tdata.append(td.getText())
            if td.div and td.div['class'][0] == 'school-logo':
                tdata.append(td.div.a['href'])
        player_data.append(tdata)

print(player_data)
