I'm trying to get the Chinese Yuan's exchange rate to USD from the Bank of China website, but I'm lost trying to get the data out of the page. Here is my code:
from lxml import etree
import requests
s = "https://www.boc.cn/sourcedb/whpj/enindex_1619.html"
page = requests.get(s)
tree = etree.HTML(page.text)
element = tree.xpath('./body/table[1]/tbody/tr/td[1]/table[1]/tbody/tr/td/table/tbody/tr[26]/td')
content = etree.tostring(element[0])
The XPath in element points at the USD row of that page, which I copied after inspecting the page in the browser.
When I run the program I get this error:
File "d:\Python\tebotCopy.py", line 8, in
content = etree.tostring(element[0]) IndexError: list index out of range
You can try a different XPath to get the row with USD. Note that the absolute path you copied includes tbody elements, which browsers insert when rendering tables but which are not present in the raw HTML that requests downloads, so your path matches nothing and element ends up an empty list. Matching the table by one of its attributes is more robust:
import requests
from lxml import etree

s = "https://www.boc.cn/sourcedb/whpj/enindex_1619.html"
page = requests.get(s)
tree = etree.HTML(page.text)

# match the rates table by its bgcolor attribute instead of an absolute path
element = tree.xpath('//table[@bgcolor="#EAEAEA"]/tr[27]/td')
for td in element:
    print(td.text, end=" ")
print()
Prints:
USD 641.51 636.29 644.23 644.23 646.12 2021.10.14 07:23:51
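A positional index like tr[27] will break if the bank reorders its currency list. A more defensive variant (a sketch, not from the original answer) selects the row by the currency label in its first cell:

import requests
from lxml import etree

url = "https://www.boc.cn/sourcedb/whpj/enindex_1619.html"
tree = etree.HTML(requests.get(url).text)

# select the <tr> whose first cell reads "USD", wherever the row sits
cells = tree.xpath('//tr[td[1][normalize-space()="USD"]]/td')
print(" ".join(td.text.strip() for td in cells if td.text))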
I'm a complete beginner with Python. I wrote this program to scrape the closing odds of NHL games from The Score website and put that data in a file. The program runs, but for some reason 2 games out of the roughly 200 I tried show up with incorrect data.
I think it is because of how I am searching through divs within a div: I wrote the code that returns the data in a way that only keeps the last div (which conveniently is the div I'm looking to scrape).
Also, I'm sure my way of writing to the file is poor for runtime; is there a better way to do this?
import requests
from bs4 import BeautifulSoup

# Function to scrape the web page and find the game title and closing odds
def get_match_data(url_val):
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(response.content, "html.parser")
    # Scrape for the header, which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    # Code to find the outer div and search for divs within it
    divs = soup.find('div', {'class': 'col-sm-4'})
    for tag in divs:
        # find the row divs
        target = tag.find_all("div", {"class": "GameDetailsCard__row--3rKYp"})
        for tag in target:
            # find the content div within each target div
            odds = tag.find("div", {"class": "GameDetailsCard__content--2L_KF"})
    # Call write_to_file -> add data scraped from the web
    write_to_file(matchtitle.text + " " + odds.text)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    file = open("NHL_GAMES.txt", "a")
    file.write(game_data + "\n")
    file.close()

### Main(void) ?? idk what to call this portion of code, not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
Here is one line in the text file with correct data:
Toronto Maple Leafs # New Jersey Devils on November 24, 2022 NJD -140, o/u 6.5
Here is one with incorrect data:
Carolina Hurricanes # Anaheim Ducks on December 7, 2022 Justin St. Pierre, Chris Lee
Only 2/215 were wrong like this.
It looks like certain NHL game webpages (e.g. the Carolina one) do not contain a <div> section for the 'Odds'; this might be due to them being OT games? Regardless, the best bet is to add a clause to handle 'no odds found'. I have updated some of your code below:
import requests
from bs4 import BeautifulSoup

# Function to scrape the web page and find the game title and closing odds
def get_match_data(url_val):
    results = []
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Scrape for the header, which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    target = soup.find_all("div", {"class": "GameDetailsCard__row--3rKYp"})
    odds = "No Odds found!"  # default in case no row carries an Odds label
    for tag in target:
        if "Odds" in str(tag.find("div", {"class": "GameDetailsCard__label--iBMhJ"})):
            odds = str(tag.find("div", {"class": "GameDetailsCard__content--2L_KF"}).text)
    print(matchtitle.text + " " + odds)
    results.append(matchtitle.text + " " + odds)
    # Call write_to_file -> add data scraped from the web
    write_to_file(results)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    print("Getting game details...")
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    with open("NHL_GAMES.txt", "a") as file:
        for line in game_data:
            file.write(line + "\n")

### Main(void) ?? idk what to call this portion of code, not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
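One extra suggestion beyond the odds fix: since multi_games_url_handler fires a couple of hundred sequential requests, reusing a single requests.Session avoids re-opening a connection for every game, and raise_for_status() surfaces missing pages instead of silently parsing an error page. A minimal sketch (fetch_page is a hypothetical helper, not in the code above):

import requests

session = requests.Session()  # one connection pool reused for every game page

def fetch_page(url_val):
    response = session.get(url_val, timeout=10)
    response.raise_for_status()  # raise on 404/500 instead of returning error HTML
    return response.text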
I'm trying to loop through a list of names. It seems like it should be very simple, but it's not working: I only receive the first name on the list and nothing more! Why is it not moving on to the NEXT name?
Here's my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")

content = soup.find(class_="supContact")
content_items = content.find_all(class_="contactToggle selected")
names = content_items[0]

s_name = []
for item in names:
    s_name.append(content.find(class_="contactToggle selected").text)
    if not names:
        continue
print(s_name)
#print(names.prettify())
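The loop only ever sees the first name because names = content_items[0] keeps a single element, and content.find(...) inside the loop always returns the first match again. A sketch of the likely fix (assuming every name on the page carries the contactToggle selected class):

import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")

s_name = []
# find_all returns every match; find stops at the first one
for item in soup.find_all(class_="contactToggle selected"):
    text = item.get_text(strip=True)
    if text:
        s_name.append(text)
print(s_name)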
I have to write a program that will read the HTML from this link (http://python-data.dr-chuck.net/known_by_Maira.html), extract the href= values from the anchor tags, scan for a tag that is at a particular position relative to the first name in the list, follow that link, repeat the process a number of times, and report the last name found.
I am supposed to find the link at position 18 (the first name is 1), follow that link, and repeat this process 7 times. The answer is the last name that I retrieve.
Here is the code I found, and it works just fine.
import urllib
from BeautifulSoup import *

url = raw_input("Enter URL: ")
count = int(raw_input("Enter count: "))
position = int(raw_input("Enter position: "))

names = []
while count > 0:
    print "retrieving: {0}".format(url)
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page)
    tag = soup('a')
    name = tag[position-1].string
    names.append(name)
    url = tag[position-1]['href']
    count -= 1

print names[-1]
I would really appreciate it if someone could explain to me, like you would to a 10-year-old, what's going on inside the while loop. I am new to Python and would really appreciate the guidance.
while count > 0:                   # because of `count -= 1` below,
                                   # the loop will run count times
    print "retrieving: {0}".format(url)  # just prints out the next web page
                                         # you are going to get
    page = urllib.urlopen(url)     # urls reference web pages (well,
                                   # many types of web content, but
                                   # we'll stick with web pages)
    soup = BeautifulSoup(page)     # web pages are frequently written
                                   # in html, which can be messy. this
                                   # package "unmessifies" it
    tag = soup('a')                # in html you can highlight text and
                                   # reference other web pages with <a>
                                   # tags. this gets all of the <a> tags
                                   # in a list
    name = tag[position-1].string  # this gets the <a> tag at position-1
                                   # and then gets its text value
    names.append(name)             # this puts that value in your own
                                   # list
    url = tag[position-1]['href']  # html tags can have attributes. on
                                   # an <a> tag, the href="something"
                                   # attribute references another web
                                   # page. you store it in `url` so that
                                   # it's the page you grab on the next
                                   # iteration of the loop
    count -= 1
You enter the number of urls you want to retrieve from a page, then the loop:
0) prints the url
1) opens the url
2) reads the source (see the BeautifulSoup docs)
3) gets every <a> tag
4) gets the whole <a ...></a> at the given position, I think
5) adds it to the list names
6) gets the url for the next iteration, i.e. pulls href from that <a ...></a>
7) prints the last item of the list names
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# ssl context so that urlopen can fetch https pages without certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

total = 0
url = input('Enter - ')
c = input('enter count-')
count = int(c)
p = input('enter position-')
pos = int(p)

while total <= count:
    html = urllib.request.urlopen(url, context=ctx).read()
    print("Retrieving", url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    counter = 0
    for tag in tags:
        counter = counter + 1
        if counter <= pos:
            x = tag.get('href', None)
            url = x
        else:
            break
    total = total + 1
Solution with explanations.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

url = input('Enter - ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))

names = []
while count > 0:
    print('Retrieving: {}'.format(url))
    html = urllib.request.urlopen(url)  # open the url using urllib
    soup = BeautifulSoup(html, 'html.parser')  # parse the html data in a clean format
    # Retrieve all of the anchor tags
    tags = soup('a')
    # This gets the <a> tag at position-1 and then gets its text value
    name = tags[position-1].string
    names.append(name)  # add the name to our list
    url = tags[position-1]['href']  # retrieve the url for the next iteration
    count -= 1

print(names)
print('Answer: ', names[count-1])
Hope it helps.
I'm trying to parse basketball stat data from pages like http://www.sports-reference.com/cbb/boxscores/2014-11-14-kentucky.html. I'm using Python 2.7.6 and BeautifulSoup 4-4.3.2. I'm searching gamelogs like the above page for the class "sortable" in order to get access to the raw stat data contained within the tables. I am only interested in the "Basic Stats" for each team.
However, the HTML that BeautifulSoup is returning is not at all what I expect. Instead I get a list of all-time team records and data for every school that has ever played. I don't have enough reputation to post a second link here of the output or I would.
Basically, there are four class "sortable" tables on the boxscore page. When I ask BS to find them by the only way I can think of to distinguish them from the other data, it instead returns completely irrelevant data and I can't even figure out where the returned data comes from.
Here's the code:
import urllib2
import re
import sys
from bs4 import BeautifulSoup

class Gamelogs():
    def __init__(self):
        # the base page that has all boxscore links
        self.teamPageSoup = BeautifulSoup(urllib2.urlopen(
            'http://www.sports-reference.com/cbb/schools/' + school +
            '/2015-gamelogs.html'))
        # use regex to only find links with score data
        self.statusPageLinks = self.teamPageSoup.findAll(href=re.compile(
            "boxscores"))

def scoredata(links, school):
    # for each link in the school's season
    for l in links:
        gameSoup = BeautifulSoup(urllib2.urlopen(l))
        # remove extra link formatting to get just the filename alone
        l = l[59 + len(school):]
        # open a local file with that filename to store the results
        fo = open(str(l), "w")
        # create a list that will hold the box score data only
        output = gameSoup.findAll(class_="sortable")
        # write it line by line to the file that was just opened
        for o in output:
            fo.write(str(o) + '\n')
        fo.close()

def getlinks(school):
    gamelogs = Gamelogs()
    # open a new file to store the output
    fo = open(school + '.txt', "w")
    # remove extraneous links
    gamelogs.statusPageLinks = gamelogs.statusPageLinks[2:]
    # create the list that will hold each school's season-long boxscores
    boxlinks = list()
    for s in gamelogs.statusPageLinks:
        # make the list element a string so it can be sliced
        string = str(s)
        # remove extra link formatting
        string = string[9:]
        string = string[:-16]
        # create the full list of games per school
        boxlinks.insert(0, 'http://www.sports-reference.com/cbb/schools/'
                        + school + string)
    scoredata(boxlinks, school)

if __name__ == '__main__':
    # for each school given as a command-line argument
    for arg in sys.argv[1:]:
        school = arg
        getlinks(school)
Is this a problem with BS, my code, or the site?
It looks like this is an issue with your code. The page that you are getting back sounds like this one: http://www.sports-reference.com/cbb/schools/?redir
Whenever I enter an invalid school name, I am redirected to that page, which shows stats for 477 different teams. FYI: team names in the URL are also case sensitive.
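A hedged sketch of a guard against that (fetch_gamelog_page is a hypothetical helper; it assumes the redirect lands on the /cbb/schools/?redir index page as described): check the final URL before parsing, so a bad school name fails loudly instead of yielding the all-time records tables.

import requests

def fetch_gamelog_page(school):
    url = ('http://www.sports-reference.com/cbb/schools/'
           + school + '/2015-gamelogs.html')
    response = requests.get(url)
    # an unknown school name bounces to the all-schools index page
    if 'redir' in response.url:
        raise ValueError('Unknown school name (case sensitive): ' + school)
    return response.text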
I am scraping data from Google Finance's historical page for a stock (http://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=PLfUVIDTDuSRiQKhwYGQBQ).
I can scrape the 30 rows on the current page. The issue I am facing is that I am unable to scrape the rest of the data in the table (rows 31-241). How do I go to the next page or link?
Following is my code:
import urllib2
import xlwt  # to write into an Excel spreadsheet
from bs4 import BeautifulSoup

# Main Coding Section
stock_links = open('stock_link_list.txt', 'r')  # opening text file for reading
#url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=zHXOVLPnApG2iALxxYCADQ"

for url in stock_links:
    OurFile = urllib2.urlopen(url)
    OurHtml = OurFile.read()
    OurFile.close()
    soup = BeautifulSoup(OurHtml)
    #soup1 = soup.find("div", {"class": "gf-table-wrapper sfe-break-bottom-16"}).get_text()
    soup1 = soup.find("table", {"class": "gf-table historical_price"}).get_text()

    end = url.index('&')
    filename = url[47:end]
    file = open(filename, 'w')  # opening text file for writing
    file.write(soup1)  # writing to the text file
    #file.write(soup1.get_text())
    file.close()  # closing the text file
You will have to fine-tune it, and I would catch more specific errors, but you can keep increasing start to fetch the next batch of data:
url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=W8LUVLHnAoOswAOFs4DACg&start={}&num=30"
from bs4 import BeautifulSoup
import requests
# Main Coding Sectio
start = 0
while True:
try:
nxt = url.format(start)
r = requests.get(nxt)
soup = BeautifulSoup(r.content)
print(soup.find("table",{"class": "gf-table historical_price"}).get_text())
except Exception as e:
print(e)
break
start += 30
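As mentioned, catching a bare Exception hides real failures. A hedged sketch of a more targeted stop condition (assuming the table is simply absent once start passes the last row):

from bs4 import BeautifulSoup
import requests

url = "https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=W8LUVLHnAoOswAOFs4DACg&start={}&num=30"

start = 0
while True:
    r = requests.get(url.format(start))
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table", {"class": "gf-table historical_price"})
    if table is None:
        break  # past the last page: the table no longer appears
    print(table.get_text())
    start += 30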
This gets all the table data, up to the last date, Feb 7:
......
Date
Open
High
Low
Close
Volume
Feb 7, 2014
552.60
557.90
548.25
551.50
119,711
At first sight, the Row Limit option allows showing at most 30 rows per page, but I manually changed the query string parameters to larger numbers and realized we can view a maximum of 200 rows per page.
Change the URL to
https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=OM3UVLFtkLnzBsjIgYAI&start=0&num=200
It will show 200 rows. Then change to start=200&num=400.
More generally, if you have many other such links, you can scrape the pagination area (the last <tr> of the table), grab the links to the next pages from it, and scrape those, as sketched below.
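A sketch of that idea (assuming, per the description above, that the pagination links sit in the last <tr> of the price table):

from bs4 import BeautifulSoup
import requests

r = requests.get('https://www.google.com/finance/historical?q=NSE%3ASIEMENS&ei=OM3UVLFtkLnzBsjIgYAI')
soup = BeautifulSoup(r.content, 'html.parser')

table = soup.find('table', {'class': 'gf-table historical_price'})
# the last row is assumed to hold the links to the next pages
last_row = table.find_all('tr')[-1]
next_urls = [a['href'] for a in last_row.find_all('a', href=True)]
for next_url in next_urls:
    print(next_url)  # fetch and parse each of these the same way as page one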