dynamic values web scraping - python

Hello guys, I've been trying to scrape some pages that contain values that change all the time, but so far I haven't been able to get the prices. Can anybody help me? This is where I've got to so far:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
driver.get(my_url)
# Grab the DOM after JavaScript has rendered, then parse it with BeautifulSoup
html = driver.execute_script("return document.documentElement.outerHTML")
sel_soup = soup(html, 'html.parser')
prices = sel_soup.findAll("td", {"class": "price"})
print(prices)

You can try the code below to get the currency names and prices:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
driver.get(my_url)
names = [name.text.split('\n')[0] for name in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'desktop-name')))]
prices = [price.text for price in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'current-price-value')))]
print(datetime.datetime.now())
for name, price in zip(names, prices):
    print(name + " - " + price)

In case you want all 10 prices, you'd have to store them in a list, like this:
all_prices = driver.find_elements_by_css_selector("td[class='price'] div")
then just iterate through the list to get the values:
for price in all_prices:
    print(price.text)
Let me know if you face any difficulties.

If you want to do the parsing with BeautifulSoup instead of Selenium's element lookups:
prices = sel_soup.select("td[class^='price'] > div")
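Note that sel_soup has to be built from the Selenium-rendered page, not from a plain requests response, because the prices are injected by JavaScript. A minimal sketch of that flow, reusing the driver and my_url from the question and the selector from this answer:
# Sketch: parse the Selenium-rendered HTML with BeautifulSoup
from bs4 import BeautifulSoup as soup
driver.get(my_url)
sel_soup = soup(driver.page_source, 'html.parser')
# td[class^='price'] > div: the <div> inside each <td> whose class starts with 'price'
prices = [div.get_text(strip=True) for div in sel_soup.select("td[class^='price'] > div")]
print(prices)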

Related

can't change the date with Selenium and Python

I need to get data from this website: https://www.eex.com/en/market-data/natural-gas/spot
But changing the date in my script is not working.
I need to get data for every available date, so I need to change the date with Selenium.
Please help, I'm new to Python.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium_stealth import stealth
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ="https://www.eex.com/en/market-data/natural-gas/spot"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
browser = webdriver.Chrome(executable_path="chromedriver1/chromedriver", options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
time.sleep(10)
date_picker = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="symbolheader_ngs"]/div/div/div/input')))
date_picker.send_keys("2023-01-23")
time.sleep(20)
page_source = browser.page_source
s = bs(page_source, 'html.parser')
table = s.select('table')[1]
final_list = []
for row in table.select('tr'):
    final_list.append([x.text for x in row.find_all(['td', 'th'])])
final_df = pd.DataFrame(final_list[2:])
final_df.columns = ['Spot', 'Last Price', 'Last Volume', 'End of Day Index', 'Volume Exchange','del']
df=final_df.drop('del',axis=1)
browser.quit()
df.to_excel('final_df.xlsx', index = False)
You need to clear the input, enter the date, then press Enter. You also want to wait for the clickability, not the visibility, of the element. Lastly, you need to pick a date that actually has data.
from selenium.webdriver.common.keys import Keys
date_picker = WebDriverWait(browser, 30).until(
    EC.element_to_be_clickable((By.XPATH, """//*[@id="symbolheader_ngs"]//*/input"""))
)
date_picker.clear()
date_picker.send_keys("2023-01-20")
date_picker.send_keys(Keys.ENTER)
If you comment out chrome_options.add_argument("--headless"), you will see the interaction happen in the browser.
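Once the new date has loaded, the table can be re-read the same way as in the question. A hedged sketch using pandas.read_html instead of the manual row loop (it assumes the quote table is still the second <table> on the page):
# Sketch, not verified against the live page: re-parse the table after the date change
import pandas as pd
from bs4 import BeautifulSoup as bs
s = bs(browser.page_source, 'html.parser')
table = s.select('table')[1]  # assumed: the quote table is still the second <table>
df = pd.read_html(str(table))[0]  # let pandas infer the header rows
print(df.head())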

Data are overwritten in DataFrame

I'm trying to scrape the data, but it gets overwritten and the CSV file ends up with the data from only one page. I think the for loop is overwriting the data. How can I fix this? This is the page link: https://www.askgamblers.com/online-casinos/countries/uk/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
for page in range(1,3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)
    product = []
    for url in urls:
        wev = {}
        driver.get(url)
        time.sleep(1)
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
        except:
            pass
        wev['Title'] = title
        soup = BeautifulSoup(driver.page_source, "lxml")
        pays = soup.select("div#tabPayments")
        for pay in pays:
            try:
                t1 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['deposit_method'] = t1
            try:
                t2 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['curriences'] = t2
            try:
                t3 = pay.select_one(".review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['with_drawl method'] = t3
            try:
                t4 = pay.select_one(".review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
                t4 = [i.replace("\n", "") for i in t4 if i.text]
            except:
                pass
            wev['with_drawl_time'] = t4
        product.append(wev)
    # product is re-created and casino.csv rewritten on every page, so only the last page survives
    df = pd.DataFrame(product)
    df.to_csv('casino.csv')
All results in one file:
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
product = []  # collected once, outside the page loop
for page in range(1,4):
    URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)
    for url in urls:
        wev = {}
        driver.get(url)
        time.sleep(1)
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
        except:
            pass
        wev['Title'] = title
        soup = BeautifulSoup(driver.page_source, "lxml")
        pays = soup.select("div#tabPayments")
        for pay in pays:
            try:
                t1 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['deposit_method'] = t1
            try:
                t2 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['curriences'] = t2
            try:
                t3 = pay.select_one(".review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except:
                pass
            wev['with_drawl method'] = t3
            try:
                t4 = pay.select_one(".review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
                t4 = [i.replace("\n", "") for i in t4 if i.text]
            except:
                pass
            wev['with_drawl_time'] = t4
        product.append(wev)
# written once, after all pages have been scraped
df = pd.DataFrame(product)
df.to_csv('casino.csv')
In the first loop, it runs only 2 times. Change the range to (1, 4) as below and it will give you pages [1, 2, 3]:
for page in range(1,4):
Then the data gets overwritten because the output file name is the same. Change the file name as below:
df.to_csv(f'casino_{page}.csv')
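Alternatively, a sketch of appending every page to one CSV as you go, writing the header only on the first page (mode and header are standard pandas to_csv options; page comes from the loop above):
# Sketch: append each page's rows to a single file instead of per-page files
df.to_csv('casino.csv', mode='a', index=False, header=(page == 1))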

No data table with Python3 Selenium

I need to improve this script to extract daily data from this site. However, I am not getting any data except for the "Spot" column!
Thanks for the help!
UPD: Now I can't change the date.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium_stealth import stealth
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ="https://www.eex.com/en/market-data/natural-gas/spot"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
browser = webdriver.Chrome(executable_path="chromedriver1/chromedriver", options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
time.sleep(10)
date_picker = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="symbolheader_ngs"]/div/div/div/input')))
date_picker.send_keys("2023-01-23")
time.sleep(20)
page_source = browser.page_source
s = bs(page_source, 'html.parser')
table = s.select('table')[1]
final_list = []
for row in table.select('tr'):
    final_list.append([x.text for x in row.find_all(['td', 'th'])])
final_df = pd.DataFrame(final_list[2:])
final_df.columns = ['Spot', 'Last Price', 'Last Volume', 'End of Day Index', 'Volume Exchange','del']
df=final_df.drop('del',axis=1)
browser.quit()
df.to_excel('final_df.xlsx', index = False)
A few little tweaks so that all columns can be extracted. The main idea is that the extraction logic needs to match how the HTML DOM is actually structured.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
def get_df(page_source):
    soup = bs(page_source, 'html.parser')
    table = soup.select('table')[1]
    table_header = table.find("tr", {"class": "mv-quote-header-row"})
    table_body = table.select('tbody')
    result = {}
    for e_header in table_header.find_all('th'):
        if e_header.text:
            result[e_header.text] = []
    for e_r in table_body[0].find_all('tr'):
        # class=False here keeps only the <td> cells without a class attribute,
        # which skips the mv-quote-button cells
        r1 = [e.text for e in e_r.find_all('td', {'class': not ['mv-quote-button']})]
        result['Spot'].append(r1[0])
        result['Last Price'].append(r1[1])
        result['Last Volume'].append(r1[2])
        result['End of Day Index'].append(r1[3])
        result['Volume Exchange'].append(r1[4])
    df = pd.DataFrame(result)
    return df
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
#chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
#webdriver_service = Service()
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
#soup = BeautifulSoup(browser.page_source, 'html5lib')
page_source=browser.page_source
#table = soup.select('table')[1]
final_df=get_df(browser.page_source)
browser.quit()
final_df.to_excel('final_df.xlsx', index = False)
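As an aside, a shorter hedged sketch using pandas.read_html might do the same job, assuming the quote table parses cleanly without the manual header handling (it would have to run before browser.quit()):
# Sketch, not verified against the live page: let pandas parse all tables at once
import pandas as pd
dfs = pd.read_html(browser.page_source)
final_df = dfs[1]  # assumed: the quote table is the second <table>, as in get_df above
print(final_df.head())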

Adding new rows to a pandas df on loop

I'm curious how to append or concat a pandas df with new data coming from a looped iteration. I'm using Selenium to view the web pages and BeautifulSoup to read the HTML. From there, I get two tables of data per page. I am running this over multiple pages, and I want to add the data from table 1 on page 2 to table 1 on page 1, and the same for table 2 on both pages.
I think I need an append function on the df, but I am not exactly sure.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = [] #empty list
datalist_results = [] #empty list
x = 0 #counter
for url in urls:
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    htmlStr = driver.page_source
    soup_level1 = soup(htmlStr, 'html.parser')
    race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
    results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')
    df_races = pd.read_html(str(race_soup))[0]
    datalist_races.append(df_races[0])
    df_results = pd.read_html(str(results_soup))[0]
    datalist_results.append(df_results[0])
    print(df_results)
    driver.close()
Any insight would be wonderful. Reading through the comments and posts here, as well as watching YT videos, has left me no further ahead.
In your loop, do this to any df you want to append a row to:
df.loc[len(df.index)] = data_element
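For illustration, a tiny self-contained example of that row-append idiom (the column names are made up):
import pandas as pd
df = pd.DataFrame(columns=['name', 'price'])  # hypothetical columns
df.loc[len(df.index)] = ['BTC', '23000']  # appends a row at the next integer index
df.loc[len(df.index)] = ['ETH', '1600']
print(df)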
So for your case:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = pd.DataFrame() #empty DataFrame to collect one race row per page
datalist_results = pd.DataFrame() #empty DataFrame to collect one result row per page
for url in urls:
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    htmlStr = driver.page_source
    soup_level1 = soup(htmlStr, 'html.parser')
    race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
    results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')
    df_races = pd.read_html(str(race_soup))[0]
    # .loc[len(...)] appends the first row of this page's table at the next integer index
    datalist_races.loc[len(datalist_races.index)] = df_races.loc[0]
    df_results = pd.read_html(str(results_soup))[0]
    datalist_results.loc[len(datalist_results.index)] = df_results.loc[0]
    print(df_results)
    driver.close()
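If the goal is to keep every row of each page's tables rather than just the first row, a hedged alternative is to collect the whole per-page DataFrames in a list and concatenate once after the loop. A tiny self-contained illustration (the column names are made up):
import pandas as pd
# hypothetical stand-ins for the per-page tables scraped above
page1 = pd.DataFrame({'horse': ['A', 'B'], 'place': [1, 2]})
page2 = pd.DataFrame({'horse': ['C', 'D'], 'place': [1, 2]})
tables = [page1, page2]  # in the real loop: tables.append(df_races)
combined = pd.concat(tables, ignore_index=True)  # one table with rows from every page
print(combined)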

Selenium isn't waiting to click button before printing results

I'm trying to scrape results from Google Flights using BeautifulSoup, after clicking a button that shows the full list of flights. I added an explicit wait, but for some reason it isn't working.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome(executable_path="/Users/Shiva/Downloads/chromedriver")
driver.get('https://www.google.com/flights?hl=en#flt=/m/03v_5.IAD.2019-02-10*IAD./m/03v_5.2019-02-11;c:USD;e:1;sd:1;t:f')
xpath = '//*[@id="flt-app"]/div[2]/main[4]/div[7]/div[1]/div[3]/div[4]/div[5]/div[1]/div[3]/jsl/a[1]/span[1]/span[2]'
wait = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, xpath)))
load_all_flights = driver.find_element_by_xpath(xpath)
load_all_flights.click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
info = soup.find_all('div', class_="gws-flights-results__collapsed-itinerary gws-flights-results__itinerary")
for trip in info:
    price = trip.find('div', class_="flt-subhead1 gws-flights-results__price gws-flights-results__cheapest-price")
    if price is None:
        price = trip.find('div', class_="flt-subhead1 gws-flights-results__price")
    type_of_flight = trip.find('div', class_="gws-flights-results__stops flt-subhead1Normal gws-flights-results__has-warning-icon")
    if type_of_flight is None:
        type_of_flight = trip.find('div', class_="gws-flights-results__stops flt-subhead1Normal")
    print(str(type_of_flight.text).strip() + " : " + str(price.text).strip())
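A plausible cause, offered as a hedged sketch rather than a confirmed fix: page_source is read immediately after click(), before the expanded list has rendered. Waiting for the itinerary rows themselves after the click may help (the class name is borrowed from the question's own selectors, so treat it as an assumption):
load_all_flights.click()
# Wait until the itinerary rows exist in the DOM before reading page_source;
# grabbing it immediately after click() can race the re-render
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "gws-flights-results__itinerary"))
)
soup = BeautifulSoup(driver.page_source, 'html.parser')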
