How to fix this web scraping program? - python

I'm a complete beginner with Python, and I wrote this program to scrape the closing odds of NHL games from The Score website and put that data in a file. The program runs, but 2 of the roughly 200 games I tried show up with incorrect data.
I think it is because of how I search through divs within a div: I wrote that part in a way that only keeps the last div (which conveniently is the div I'm looking to scrape).
Also, I'm sure my way of writing to the file is poor for runtime; is there a better way to do this?
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(response.content, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    # Code to find div and search for div within
    divs = soup.find('div', {'class': 'col-sm-4'})
    for tag in divs:
        # find div
        target = tag.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
        for tag in target:
            # find divs within target div
            odds = tag.find("div", {"class", "GameDetailsCard__content--2L_KF"})
        # Call write_to_file -> add data scraped from web
        write_to_file(matchtitle.text + " " + odds.text)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    file = open("NHL_GAMES.txt", "a")
    file.write(game_data + "\n")
    file.close

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
Here is one line in the text file with correct data:
Toronto Maple Leafs # New Jersey Devils on November 24, 2022 NJD -140, o/u 6.5
Here is one with incorrect data:
Carolina Hurricanes # Anaheim Ducks on December 7, 2022 Justin St. Pierre, Chris Lee
Only 2/215 were wrong like this.

It looks like certain NHL game webpages (e.g. the Carolina one) do not contain a <div> section for the 'Odds', possibly because they were OT games. Regardless, the best bet is to add a clause to handle 'no odds found'. I have updated some of your code below:
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    results = []
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    target = soup.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
    for tag in target:
        if "Odds" in str(tag.find("div", {"class": "GameDetailsCard__label--iBMhJ"})):
            odds = str(tag.find("div", {"class": "GameDetailsCard__content--2L_KF"}).text)
        else:
            odds = "No Odds found!"
    print(matchtitle.text + " " + odds)
    results.append(matchtitle.text + " " + odds)
    # Call write_to_file -> add data scraped from web
    write_to_file(results)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    print("Getting game details...")
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    with open("NHL_GAMES.txt", "a") as file:
        for line in game_data:
            file.write(line + "\n")

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
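On the runtime question about file writing: reopening NHL_GAMES.txt in append mode once per game is fine for ~215 games, but a cleaner option is to have get_match_data return its results list instead of calling write_to_file, and open the file exactly once in the main section. A minimal sketch of that idea, assuming get_match_data above is changed to end with return results:

# Sketch only: assumes get_match_data() now ends with `return results`
# instead of calling write_to_file(results).
link = "https://www.thescore.com/nhl/events/"

with open("NHL_GAMES.txt", "a") as outfile:   # the file is opened a single time
    for x in range(26500, 26715):
        for line in get_match_data(link + str(x)):
            outfile.write(line + "\n")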

Related

Stuck Scraping with Beautifulsoup

So I'm trying to scrape an HTML webpage. It has novel chapters, and I'm trying to get the text and store it in text files to read offline. I don't have any previous experience with HTML or related things either. The webpage I am trying to scrape is this, and the code I've been testing so far looks like this:
import sys
import requests
import time
import re
from bs4 import BeautifulSoup

def browse_and_scrape(seed_url, page_number=1):
    # Fetch the URL - We will be using this to append to images and info routes
    url_pat = re.compile(r"(http://.*\.org)")
    source_url = url_pat.search(seed_url).group(0)
    # Page_number from the argument gets formatted in the URL & Fetched
    formatted_url = seed_url.format(str(page_number))
    # print(url_pat, source_url, formatted_url)
    try:
        html_text = requests.get(formatted_url).text
        # print(html_text)
        # Prepare the soup
        soup = BeautifulSoup(html_text, "html.parser")
        print(soup.find_all(id="chapterContent")[0]["style"])
        print(f"Now Scraping - {formatted_url}")
        # help = soup.find_all("div", class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
        # for node in soup.findAll("div", class_="chapter-content text-normal"):
        #     print(node)
        #     print(''.join(node.findAll(text=True)))
        # for node in soup.findAll("div"):
        #     # print(node)
        #     print(''.join(node.findAll(text=True)))
        # help = soup.find_all("div", class_="chapter-content text-normal")[0]
        # print(''.join(help.findAll(text=True)))
        # print(help)
    except Exception as e:
        return e
    return True

if __name__ == "__main__":
    # seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
    seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
    # seed_url = "http://wnmtl.org/chapter/{}.html"
    print("Web scraping has begun")
    result = browse_and_scrape(seed_url)
    if result == True:
        print("Web scraping is now complete!")
    else:
        print(f"Oops, That doesn't seem right!!! - {result}")
All the commented-out stuff is what I've been trying in order to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with the id chapterContent. My plan is to iteratively get the text, store it, get the link for the next page, and repeat, but I've been stuck for a bit now. Any suggestions?
Instead of scraping each page, you can directly get the text from this API endpoint using requests:
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last part of the above URL is the chapter ID (324909). You can navigate to other chapters by passing in their chapter IDs.
The next and previous chapter IDs are present in the current chapter's API response. Have a look at the above URL in a browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your needs.
import requests

def get_data(chapter_id, pages):
    if pages == 0:
        return
    url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
    r = requests.get(url)
    x = r.json()
    pre_id = x['data']['preId']
    next_id = x['data']['nextId']
    title = x['data']['title']
    content = x['data']['content']
    chapter_title = f'\n***** Chapter: {title} *****\n'
    with open('novel.txt', 'a') as f:
        f.write(chapter_title)
        f.write(content + '\n')
    print(f"Chapter: '{title}' written to file.")
    get_data(next_id, pages-1)

curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.
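If you ever want to pull a large number of chapters, the same idea works iteratively, which avoids Python's recursion limit (roughly 1000 nested calls by default). A minimal sketch assuming the same nextId/title/content fields as above:

import requests

def get_chapters(chapter_id, pages):
    # Follow the nextId chain with a plain loop instead of recursion
    for _ in range(pages):
        url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
        data = requests.get(url).json()['data']
        with open('novel.txt', 'a') as f:
            f.write(f"\n***** Chapter: {data['title']} *****\n")
            f.write(data['content'] + '\n')
        print(f"Chapter: '{data['title']}' written to file.")
        chapter_id = data['nextId']

get_chapters('324909', 3)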

python, run def() multiple times

I wrote this code to get eBay prices. It asks for a full eBay link, then writes its price:
import bs4, requests

print('please enter full Ebay link ..')
link = str(input())

def ebayprice(url):
    res = requests.get(link)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

price = ebayprice(link)
print('price is : ' + price)
I want to improve it; I tried my best and I couldn't.
I want it to take multiple links, run them one by one, and write the result each time.
It doesn't matter if the links come from input() or from links = 'www1,www2,www3'.
You can split by comma and iterate over the list using a for loop:
def ebayprice(url):
    ...

for single_link in link.split(','):
    price = ebayprice(single_link)
    print('price for {} is {}'.format(single_link, price))
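Putting that together with the function from the question, a minimal runnable sketch (note that inside ebayprice the request must use the url parameter rather than the global link, otherwise every iteration fetches the same page):

import bs4, requests

def ebayprice(url):
    # use the url argument here, not the global `link`
    res = requests.get(url)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    # '#mm-saleDscPrc' is the price element id from the question
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

print('please enter eBay links separated by commas ..')
links = input()

for single_link in links.split(','):
    price = ebayprice(single_link.strip())
    print('price for {} is {}'.format(single_link.strip(), price))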
If you want, you can ask how many links someone wants to scrape, and then use a for loop to go through every URL:
import bs4, requests

# ask how many links will be passed
print('How many links do you want to scrape ?')
link_numb = int(input())

# get the links
print('please enter full Ebay link ..')
links = [input() for _ in range(link_numb)]

def ebayprice(link):
    res = requests.get(link)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

for link in links:
    price = ebayprice(link)
    print(price)
Example:
How many links do you want to scrape ?
2
please enter full Ebay link ..
http://example.com
http://example-just-test.com
# simply prints the urls
http://example.com
http://example-just-test.com

Python Link Scraper

import requests

focus_Search = raw_input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search)
print("You Just Searched")
res_String = res.text
# Now I must get ALL the sections of code that start with "<a href" and end with "/a>"
I'm trying to scrape all the links from a Google search webpage. I could extract each link one at a time, but I'm sure there's a better way to do it.
This creates a list of all the links on the search page, using some of your code, without getting into BeautifulSoup:
import requests
import lxml.html

focus_Search = input("Focus Search ")
url = "https://www.google.com/search?q="
#focus_Search
res = requests.get(url + focus_Search).content
# res
dom = lxml.html.fromstring(res)
links = [x for x in dom.xpath('//a/@href')]  # Borrows from cheekybastard in the link below
# http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
print(links)
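If you would rather do the same thing with BeautifulSoup, a roughly equivalent sketch is below (html.parser is just one parser choice; href=True skips anchors without an href attribute):

import requests
from bs4 import BeautifulSoup

focus_Search = input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search)

soup = BeautifulSoup(res.text, "html.parser")
# collect the href attribute of every <a> tag that has one
links = [a["href"] for a in soup.find_all("a", href=True)]
print(links)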

BeautifulSoup returning unrelated HTML

I'm trying to parse basketball stat data from pages like http://www.sports-reference.com/cbb/boxscores/2014-11-14-kentucky.html. I'm using Python 2.7.6 and BeautifulSoup 4-4.3.2. I'm searching gamelogs like the above page for the class "sortable" in order to get access to the raw stat data contained within the tables. I am only interested in the "Basic Stats" for each team.
However, the HTML that BeautifulSoup is returning is not at all what I expect. Instead I get a list of all-time team records and data for every school that has ever played. I don't have enough reputation to post a second link here of the output or I would.
Basically, there are four class "sortable" tables on the boxscore page. When I ask BS to find them by the only way I can think of to distinguish them from the other data, it instead returns completely irrelevant data and I can't even figure out where the returned data comes from.
Here's the code:
import urllib2
import re
import sys
from bs4 import BeautifulSoup

class Gamelogs():
    def __init__(self):
        # the base page that has all boxscore links
        self.teamPageSoup = BeautifulSoup(urllib2.urlopen(
            'http://www.sports-reference.com/cbb/schools/' + school +
            '/2015-gamelogs.html'))
        # use regex to only find links with score data
        self.statusPageLinks = self.teamPageSoup.findAll(href=re.compile(
            "boxscores"))

def scoredata(links, school):
    # for each link in the school's season
    for l in links:
        gameSoup = BeautifulSoup(urllib2.urlopen(l))
        # remove extra link formatting to get just the filename alone
        l = l[59+len(school):]
        # open a local file with that filename to store the results
        fo = open(str(l), "w")
        # create a list that will hold the box score data only
        output = gameSoup.findAll(class_="sortable")
        # write it line by line to the file that was just opened
        for o in output:
            fo.write(str(o) + '\n')
        fo.close()

def getlinks(school):
    gamelogs = Gamelogs()
    # open a new file to store the output
    fo = open(school + '.txt', "w")
    # remove extraneous links
    gamelogs.statusPageLinks = gamelogs.statusPageLinks[2:]
    # create the list that will hold each school's season-long boxscores
    boxlinks = list()
    for s in gamelogs.statusPageLinks:
        # make the list element a string so it can be sliced
        string = str(s)
        # remove extra link formatting
        string = string[9:]
        string = string[:-16]
        # create the full list of games per school
        boxlinks.insert(0, 'http://www.sports-reference.com/cbb/schools/'
                        + school + string)
    scoredata(boxlinks, school)

if __name__ == '__main__':
    # for each school as a commandline argument
    for arg in sys.argv[1:]:
        school = arg
        getlinks(school)
Is this a problem with BS, my code, or the site?
It looks like this is an issue with your code. The page that you are getting back sounds like this one: http://www.sports-reference.com/cbb/schools/?redir
Whenever I enter an invalid school name, I am redirected to a page showing stats for 477 different teams. FYI: team names in the URL are also case-sensitive.
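One way to catch this early is to check whether urllib2 followed a redirect before you start parsing. A minimal sketch, using the gamelogs URL pattern from the question (the school name here is only an example):

import urllib2
from bs4 import BeautifulSoup

school = 'kentucky'   # example value; names in the URL are case sensitive
url = ('http://www.sports-reference.com/cbb/schools/' + school +
       '/2015-gamelogs.html')

page = urllib2.urlopen(url)
# geturl() returns the final URL after any redirects
if page.geturl() != url:
    print('Redirected to ' + page.geturl() + ' - check the school name')
else:
    teamPageSoup = BeautifulSoup(page)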

Scraping data through paginated table using python

I am scraping data from Google Finance's historical page for a stock (http://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=PLfUVIDTDuSRiQKhwYGQBQ).
I can scrape the 30 rows on the current page. The issue I am facing is that I am unable to scrape the rest of the data in the table (rows 31-241). How do I go to the next page or link?
Following is my code:
import urllib2
import xlwt  # to write into excel spreadsheet
from bs4 import BeautifulSoup

# Main Coding Section
stock_links = open('stock_link_list.txt', 'r')  # opening text file for reading
#url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=zHXOVLPnApG2iALxxYCADQ"

for url in stock_links:
    OurFile = urllib2.urlopen(url)
    OurHtml = OurFile.read()
    OurFile.close()
    soup = BeautifulSoup(OurHtml)
    #soup1 = soup.find("div", {"class": "gf-table-wrapper sfe-break-bottom-16"}).get_text()
    soup1 = soup.find("table", {"class": "gf-table historical_price"}).get_text()

    end = url.index('&')
    filename = url[47:end]
    file = open(filename, 'w')  # opening text file for writing
    file.write(soup1)
    #file.write(soup1.get_text())  # writing to the text file
    file.close()  # closing the text file
You will have to fine-tune it, and I would catch more specific errors, but you can keep increasing start to get the next batch of data:
url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=W8LUVLHnAoOswAOFs4DACg&start={}&num=30"
from bs4 import BeautifulSoup
import requests
# Main Coding Sectio
start = 0
while True:
try:
nxt = url.format(start)
r = requests.get(nxt)
soup = BeautifulSoup(r.content)
print(soup.find("table",{"class": "gf-table historical_price"}).get_text())
except Exception as e:
print(e)
break
start += 30
This gets all the table data up to the last date, Feb 7:
......
Date          Open      High      Low       Close     Volume
Feb 7, 2014   552.60    557.90    548.25    551.50    119,711
At first sight the Row Limit option only allows a maximum of 30 rows per page, but I manually changed the query string parameters to larger numbers and realized we can view at most 200 rows per page.
Change the URL to
https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=OM3UVLFtkLnzBsjIgYAI&start=0&num=200
It will show 200 rows. Then change to start=200&num=400, and so on.
More logically, if you have many other such links, you can scrape the pagination area (the last TR), grab the links of the next pages, and scrape those.
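For completeness, a minimal sketch of the start/num paging described above, requesting 200 rows at a time (with 241 rows in total, per the question, two requests cover everything):

from bs4 import BeautifulSoup
import requests

base = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&start={}&num=200"

for start in (0, 200):
    r = requests.get(base.format(start))
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table", {"class": "gf-table historical_price"})
    if table is None:
        break   # no more rows on this page
    print(table.get_text())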
