Extract data by looping through dates using pandas - Python

I want to scrape exchange rate data from July 1, 2021 to June 30, 2022 by enumerating the exchangeDate variable, and save it to Excel.
Here is my code so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Set the URL for the website you want to scrape
url = "https://www.bot.go.tz/ExchangeRate/previous_rates?__RequestVerificationToken=P0qGKEy8P6ISFMLlu7mKvMi4YrMyeHc1aCz4ZuGQVyJ6mK9w6StV6QPyinF7ym_mAZG6yO6ShU1DuFm6teqBAxCcCrEQSjz7KtXzi2kbJH41&exchangeDate=04%2F05%2F2022"
# Send an HTTP request to the website and retrieve the HTML content
response = requests.get(url)
html = response.content
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# Find the table containing the data you want to scrape
table = soup.find("table", attrs={"class": "table"})
# Extract the data from the table and save it to a Pandas DataFrame
df = pd.read_html(str(table))[0]
# Save the DataFrame to an Excel file
df.to_excel("exchange_Rate_data.xlsx", index=False)
How do I loop through all dates?

You can use something like this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
start = '2021-07-01'
end = '2022-06-30'
# Build the list of dates as MM/DD/YYYY, URL-encoded ('/' becomes '%2F')
dates = [i.replace('-', '%2F') for i in pd.date_range(start, end, freq='d').strftime('%m-%d-%Y').tolist()]
final_df = pd.DataFrame()
for i in dates:
    # Set the URL for the website you want to scrape, substituting the date
    url = "https://www.bot.go.tz/ExchangeRate/previous_rates?__RequestVerificationToken=P0qGKEy8P6ISFMLlu7mKvMi4YrMyeHc1aCz4ZuGQVyJ6mK9w6StV6QPyinF7ym_mAZG6yO6ShU1DuFm6teqBAxCcCrEQSjz7KtXzi2kbJH41&exchangeDate={}".format(i)
    # Send an HTTP request to the website and retrieve the HTML content
    response = requests.get(url)
    html = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")
    # Find the table containing the data you want to scrape
    table = soup.find("table", attrs={"class": "table"})
    # Extract the data from the table into a pandas DataFrame
    df = pd.read_html(str(table))[0]
    # Append this date's rows to the combined DataFrame
    final_df = pd.concat([final_df, df])
# Save the combined DataFrame to an Excel file once, after the loop
final_df.to_excel("exchange_Rate_data.xlsx", index=False)
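One caveat: on dates with no published rates (weekends, holidays), soup.find returns None and pd.read_html(str(None)) raises a ValueError. A slightly more defensive sketch of the same loop, with a pause between requests (the one-second interval is an arbitrary choice, not something the site documents):

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

base_url = ("https://www.bot.go.tz/ExchangeRate/previous_rates"
            "?__RequestVerificationToken=P0qGKEy8P6ISFMLlu7mKvMi4YrMyeHc1aCz4ZuGQVyJ6mK9w6StV6QPyinF7ym_mAZG6yO6ShU1DuFm6teqBAxCcCrEQSjz7KtXzi2kbJH41"
            "&exchangeDate={}")
dates = [i.replace('-', '%2F') for i in pd.date_range('2021-07-01', '2022-06-30', freq='d').strftime('%m-%d-%Y').tolist()]

frames = []
for i in dates:
    soup = BeautifulSoup(requests.get(base_url.format(i)).content, "html.parser")
    table = soup.find("table", attrs={"class": "table"})
    if table is None:
        continue  # no rates published for this date; skip it
    frames.append(pd.read_html(str(table))[0])
    time.sleep(1)  # be polite to the server between requests

final_df = pd.concat(frames, ignore_index=True)
final_df.to_excel("exchange_Rate_data.xlsx", index=False)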

Related

How to get data from a webpage table

On the website https://www.shanghairanking.com/rankings/arwu/2020, the URL doesn't change when I hit "next". Any ideas on how to scrape the tables on the next pages? Using bs4 in Python, I am only able to scrape the table on the first page.
What I did so far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
html_text = requests.get('https://www.shanghairanking.com/rankings/arwu/2020').text
soup = BeautifulSoup(html_text,'lxml')
data = soup.find('table', class_= "rk-table").text.replace(' ','')
print(data)
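The ranking table here is rendered and paginated by JavaScript, so requests only ever sees the first page. A common workaround is to drive a real browser with Selenium and click the "next" control between scrapes; a minimal sketch (the pagination class name below is a guess, not taken from the site, and should be checked against the live page):

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes chromedriver is on your PATH
driver.get('https://www.shanghairanking.com/rankings/arwu/2020')
time.sleep(3)  # crude wait for the JavaScript table to render

frames = []
while True:
    # Parse whatever page of the table is currently displayed
    frames.append(pd.read_html(driver.page_source)[0])
    # 'ant-pagination-next' is a guess at the "next" button's class name;
    # inspect the page to confirm the real selector
    next_btn = driver.find_element(By.CLASS_NAME, 'ant-pagination-next')
    if 'disabled' in (next_btn.get_attribute('class') or ''):
        break  # reached the last page
    next_btn.click()
    time.sleep(2)

driver.quit()
df = pd.concat(frames, ignore_index=True)
print(df)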

Getting rating data from a Vivino page with Python

I am scraping the rating data from Vivino. The URL of the page is in the code below, but I got an empty data list. Can anyone help me?
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.vivino.com/explore?e=eJzLLbI11jNVy83MszVXy02ssDU2UEuutPXzUUu2dQ0NUiuwNVRLT7MtSyzKTC1JzFHLT7ItSizJzEsvjk8sSy1KTE9Vy7dNSS1OVisviY4FKgZTRhDKGMozgdDmEMoEAJ7xJhY%3D"  # the URL of the page we're scraping
page = requests.get(url)
html = BeautifulSoup(page.content, "html.parser")
Name_html = html.find_all(class_="item_description")
Rating_html = html.find_all(class_="vivinoRating__averageValue--3Navj")
Name_list = []
Rating_list = []
# Loop over the HTML elements, get their text and add it to the lists
for i in Name_html:
    Name_list.append(i.get_text())
for i in Rating_html:
    Rating_list.append(i.get_text())
# Make a dictionary with column names for the data to put in the DataFrame
data = {"description": Name_list,
        "rating": Rating_list}
df = pd.DataFrame(data)  # make a DataFrame
The wine labels are loaded through JavaScript; you can't scrape them with bs4 as if they were part of a static HTML page.
You need to either manually send the POST requests to get the response content, or scrape the page dynamically with Selenium through any of the available browser simulation drivers (e.g. chromedriver).
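For the Selenium route, a minimal sketch that renders the page in a real browser and then reuses the same bs4 selectors as above (the class names are taken from the question and may have changed on Vivino's side):

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.vivino.com/explore?e=eJzLLbI11jNVy83MszVXy02ssDU2UEuutPXzUUu2dQ0NUiuwNVRLT7MtSyzKTC1JzFHLT7ItSizJzEsvjk8sSy1KTE9Vy7dNSS1OVisviY4FKgZTRhDKGMozgdDmEMoEAJ7xJhY%3D"

driver = webdriver.Chrome()  # assumes chromedriver is on your PATH
driver.get(url)
time.sleep(5)  # crude wait for the JavaScript-rendered wine cards to load
html = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

names = [i.get_text(strip=True) for i in html.find_all(class_="item_description")]
ratings = [i.get_text(strip=True) for i in html.find_all(class_="vivinoRating__averageValue--3Navj")]
df = pd.DataFrame({"description": names, "rating": ratings})
print(df)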

How do I change this code to scrape a table

I am struggling to scrape data from this website:
https://wix-visual-data.appspot.com/app/widget?pageId=cu7nt&compId=comp-kesofw00&viewerCompId=comp-kesofw00&siteRevision=947&viewMode=site&deviceType=desktop&locale=en&tz=Europe%2FLondon&width=980&height=890&instance=k983l1LiiUeOz5_3Pd_CLXbjfadc08q1fEu54xfh9aA.eyJpbnN0YW5jZUlkIjoiYjQ0MWIxMGUtNTRmNy00YzdhLTgwY2QtNmU0ZjkwYzljMzA3IiwiYXBwRGVmSWQiOiIxMzQxMzlmMy1mMmEwLTJjMmMtNjkzYy1lZDIyMTY1Y2ZkODQiLCJtZXRhU2l0ZUlkIjoiM2M3ZmE5OWItY2I3Yy00MTg0LTk1OTEtNWY0MDhmYWYwZmRhIiwic2lnbkRhdGUiOiIyMDIxLTAxLTMwVDAxOjIzOjAyLjU1MVoiLCJ1aWQiOiIzYWMyNDI3YS04NGVhLTQ0ZGUtYjYxMS02MTNiZTVlOWJiZGQiLCJkZW1vTW9kZSI6ZmFsc2UsImFpZCI6IjczYWE3ZWNjLTQyODUtNDY2My1iNjMxLTMzMjE0MWJiZDhhMiIsImJpVG9rZW4iOiI4ODNlMTg5NS05ZjhiLTBkZmUtMTU1Yy0zMTBmMWY2NmNjZGQiLCJzaXRlT3duZXJJZCI6ImVhYWU1MDEzLTMxZjgtNDQzNC04MDFhLTE3NDQ2N2EwZjE5YSIsImV4cGlyYXRpb25EYXRlIjoiMjAyMS0wMS0zMFQwNToyMzowMi41NTFaIiwiaGFzVXNlclJvbGUiOmZhbHNlfQ&currency=GBP&currentCurrency=GBP&vsi=795183b4-8f30-4854-bd85-77678dbe4cf8&consent-policy=%7B%22func%22%3A0%2C%22anl%22%3A0%2C%22adv%22%3A0%2C%22dt3%22%3A1%2C%22ess%22%3A1%7D&commonConfig=%7B%22brand%22%3A%22wix%22%2C%22bsi%22%3Anull%2C%22BSI%22%3Anull%7D
This URL has a table, but for some reason I am not able to scrape it into an Excel file. This is my current code in Python and what I have tried so far. Any help is much appreciated, thank you legends!
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://wix-visual-data.appspot.com/app/widget?pageId=cu7nt&compId=comp-kesofw00&viewerCompId=comp-kesofw00&siteRevision=947&viewMode=site&deviceType=desktop&locale=en&tz=Europe%2FLondon&width=980&height=890&instance=dxGyx3zK9ULK0A8UtGOrLw-__FTD9EBEfzQojJ7Bz00.eyJpbnN0YW5jZUlkIjoiYjQ0MWIxMGUtNTRmNy00YzdhLTgwY2QtNmU0ZjkwYzljMzA3IiwiYXBwRGVmSWQiOiIxMzQxMzlmMy1mMmEwLTJjMmMtNjkzYy1lZDIyMTY1Y2ZkODQiLCJtZXRhU2l0ZUlkIjoiM2M3ZmE5OWItY2I3Yy00MTg0LTk1OTEtNWY0MDhmYWYwZmRhIiwic2lnbkRhdGUiOiIyMDIxLTAxLTI5VDE4OjM0OjQwLjgwM1oiLCJ1aWQiOiIzYWMyNDI3YS04NGVhLTQ0ZGUtYjYxMS02MTNiZTVlOWJiZGQiLCJkZW1vTW9kZSI6ZmFsc2UsImFpZCI6IjczYWE3ZWNjLTQyODUtNDY2My1iNjMxLTMzMjE0MWJiZDhhMiIsImJpVG9rZW4iOiI4ODNlMTg5NS05ZjhiLTBkZmUtMTU1Yy0zMTBmMWY2NmNjZGQiLCJzaXRlT3duZXJJZCI6ImVhYWU1MDEzLTMxZjgtNDQzNC04MDFhLTE3NDQ2N2EwZjE5YSIsImV4cGlyYXRpb25EYXRlIjoiMjAyMS0wMS0yOVQyMjozNDo0MC44MDNaIiwiaGFzVXNlclJvbGUiOmZhbHNlfQ&currency=GBP&currentCurrency=GBP&vsi=57130cda-8191-488e-8089-f472928266e3&consent-policy=%7B%22func%22%3A0%2C%22anl%22%3A0%2C%22adv%22%3A0%2C%22dt3%22%3A1%2C%22ess%22%3A1%7D&commonConfig=%7B%22brand%22%3A%22wix%22%2C%22bsi%22%3Anull%2C%22BSI%22%3Anull%7D"
table_id = "theTable"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# Look up the table by the id defined above
table = soup.find('table', attrs={"id": table_id})
df = pd.read_html(str(table))[0]
The page is loading the data using JavaScript. You can find the URL using the Network tab of Firefox. Even better news: the data is in CSV format, so you don't even need an HTML parser to parse it.
You can find the CSV here.
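Once you have copied the CSV URL from the Network tab, pandas can read it directly; a short sketch (the csv_url below is a placeholder, not the real endpoint):

import pandas as pd

# Placeholder: paste the CSV URL copied from the browser's Network tab here
csv_url = "https://wix-visual-data.appspot.com/path/to/data.csv"

df = pd.read_csv(csv_url)  # pandas fetches the URL and parses the CSV itself
df.to_excel("table_data.xlsx", index=False)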

Scraping data from the Chicago Mercantile Exchange website

I am trying to scrape data from a table on the CME website. Specifically, I want to pull the open interest data for every currency future, but when I try to parse the table it gives me nothing.
The link from which I am trying to scrape the data appears in the code below, which is how I am trying to do it.
from bs4 import BeautifulSoup
import requests
url="https://www.cmegroup.com/market-data/volume-open-interest/fx-volume.html"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse the html content
soup = BeautifulSoup(html_content, "html.parser")
table = soup.find("table", attrs={"class": "cmeData voiDataset"})
print(table)
The table data comes from another HTML document, which you can get with:
from bs4 import BeautifulSoup
import requests
url = 'https://www.cmegroup.com/CmeWS/mvc/xsltTransformer.do?xlstDoc=/XSLT/md/voi/voi_asset_class_final.xsl&url=/da/VOI/V2/Totals/TradeDate/20201116/AssetClassId/3/ReportType/F?excluded=CEE,CEU,KCB&hidelinks=false&html='
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse the html content
soup = BeautifulSoup(html_content, "html.parser")
table = soup.find("table", attrs={"class": "cmeData voiDataset"})
print(table)
To get the data for a specific date, you can change the URL as below:
# Date for 2020 November 12
date = '20201112'
url = 'https://www.cmegroup.com/CmeWS/mvc/xsltTransformer.do?xlstDoc=/XSLT/md/voi/voi_asset_class_final.xsl&url=/da/VOI/V2/Totals/TradeDate/{}/AssetClassId/3/ReportType/F?excluded=CEE,CEU,KCB&hidelinks=false&html='.format(date)
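From there, the same pd.read_html pattern used elsewhere on this page turns the parsed table into a DataFrame; a short sketch:

import requests
import pandas as pd
from bs4 import BeautifulSoup

date = '20201112'
url = 'https://www.cmegroup.com/CmeWS/mvc/xsltTransformer.do?xlstDoc=/XSLT/md/voi/voi_asset_class_final.xsl&url=/da/VOI/V2/Totals/TradeDate/{}/AssetClassId/3/ReportType/F?excluded=CEE,CEU,KCB&hidelinks=false&html='.format(date)

html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "html.parser")
# Find the open-interest table and load it into pandas
table = soup.find("table", attrs={"class": "cmeData voiDataset"})
df = pd.read_html(str(table))[0]
df.to_csv("cme_open_interest_{}.csv".format(date), index=False)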

BeautifulSoup not working after the first page

I'm trying to use Python's BeautifulSoup to scrape data from the following website. The data on the website is split over four different pages. Each page has a unique link (i.e. http://insider.espn.com/nbadraft/results/top100/_/year/2019/set/0 for the first page, http://insider.espn.com/nbadraft/results/top100/_/year/2019/set/1 for the second page, etc.). I am able to successfully scrape the data on the first page, but when I try to scrape data for the second page onward it comes up empty. Here is the code I'm using:
# Import libraries
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
# Define url and request webpage
season = 2019
page = 1
url = "http://insider.espn.com/nbadraft/results/top100/_/year/{}/set/{}".format(season, page)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
# Scrape all of the data in the table
rows = page_soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
# Get the column headers
headers = player_stats[0]
# Remove the first row
player_stats.pop(0)
# Convert to pandas dataframe
df = pd.DataFrame(player_stats, columns = headers)
# Remove all rows where Name = None
df = df[~df['NAME'].isnull()]
# Remove PLAYER column because it's empty
df = df.drop(columns='PLAYER')
df
Any advice would be much appreciated! I'm a bit new to using BeautifulSoup, so I apologize in advance if the code isn't particularly nice or efficient.
Update: The links only work if opened in Chrome, which is likely what is causing the problem. Is there any way around it?
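If the later pages are being populated by JavaScript (which would explain empty results from urlopen while the links work in a browser), one way around it is to load each set URL in a real browser via Selenium and parse the rendered tables; a hedged sketch, assuming the four pages are set/0 through set/3 as described above:

import time
import pandas as pd
from selenium import webdriver

season = 2019
driver = webdriver.Chrome()  # assumes chromedriver is on your PATH

frames = []
for page in range(4):  # set/0 through set/3
    url = "http://insider.espn.com/nbadraft/results/top100/_/year/{}/set/{}".format(season, page)
    driver.get(url)
    time.sleep(3)  # crude wait for the page to finish rendering
    # Parse every table in the rendered page and keep the first one
    frames.append(pd.read_html(driver.page_source)[0])

driver.quit()
df = pd.concat(frames, ignore_index=True)
print(df)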
