Adding new rows to a pandas df on loop - python

I'm curious how to append or concat a pandas df with new data coming from a looped interation. I'm using selenium to view the web pages and BeautifulSoup to read the HTML. From there, I get a two tables of data per page. I am running this over multiple pages and I want to add the data from table 1 on page 2 to the table 1 on page 1, and the same for table 2 on both pages.
I think I need an append function on the df, but I am not exactly sure.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = [] #empty list
x = 0 #counter
datalist_results = [] #empty list
x = 0 #counter
for url in urls:
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df_races = pd.read_html(str(race_soup))[0]
datalist_races.append(df_races[0])
df_results = pd.read_html(str(results_soup))[0]
datalist_results.append(df_results[0])
print(df_results)
driver.close()
Any insight would be wonderful. Reading through the comments and posts here, as well as watching YT videos, have left me no further ahead.

In your loop do this to any df you want to append:
df.loc[len(df.index)] = data_element
so for your case
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = [] #empty list
x = 0 #counter
datalist_results = [] #empty list
x = 0 #counter
for url in urls:
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df_races = pd.read_html(str(race_soup))[0]
datalist_races.loc[len(datalist_races.index)] = df_races.loc[0]
df_results = pd.read_html(str(results_soup))[0]
datalist_results.loc[len(datalist_results.index)] = df_results.loc[0]
print(df_results)
driver.close()

Related

No data table with Python3 Selenium

I need to improve this script to extract daily data from this site. However, I am not getting any data except for the "Spot" column!
Thanks for the help!
UPD. Now i can't change the date(
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium_stealth import stealth
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ="https://www.eex.com/en/market-data/natural-gas/spot"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
browser = webdriver.Chrome(executable_path="chromedriver1/chromedriver", options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
time.sleep(10)
date_picker = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[#id="symbolheader_ngs"]/div/div/div/input')))
date_picker.send_keys("2023-01-23")
time.sleep(20)
page_source = browser.page_source
s = bs(page_source)
table = s.select('table')[1]
final_list = []
for row in table.select('tr'):
final_list.append([x.text for x in row.find_all(['td', 'th'])])
final_df = pd.DataFrame(final_list[2:], columns = final_list[:1])
final_df.columns = ['Spot', 'Last Price', 'Last Volume', 'End of Day Index', 'Volume Exchange','del']
df=final_df.drop('del',axis=1)
browser.quit()
df.to_excel('final_df.xlsx', index = False)
little tweaks so that all columns can be extracted. main idea is that extract logic need to be checked with how HTML dom is.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
def get_df(page_source):
soup = bs(page_source, 'html.parser')
table = soup.select('table')[1]
table_header=table.find("tr", {"class": "mv-quote-header-row"})
table_body=table.select('tbody')
result={}
for e_header in table_header.find_all('th'):
if e_header.text:
result[e_header.text]=[]
for e_r in table_body[0].find_all('tr'):
r1=[e.text for e in e_r.find_all('td',{'class':not ['mv-quote-button']})]
result['Spot'].append(r1[0])
result['Last Price'].append(r1[1])
result['Last Volume'].append(r1[2])
result['End of Day Index'].append(r1[3])
result['Volume Exchange'].append(r1[4])
#result
df=pd.DataFrame(result)
return df
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
#chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
#webdriver_service = Service()
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
#soup = BeautifulSoup(browser.page_source, 'html5lib')
page_source=browser.page_source
#table = soup.select('table')[1]
final_df=get_df(browser.page_source)
browser.quit()
final_df.to_excel('final_df.xlsx', index = False)

Python selenium web scraped data to csv export

So i am working on a custom web scraper for any kind of ecommerce site, i want it to scrape names and prices of listings on a site and then export them to csv, but the problem is it exports only one line of (name, price) and it prints it on every line of csv, i couldnt find a good solution for this, i hope im not asking an extremely stupid thing, although i think the fix is easy. I hope someone will read my code and help me, thank you !
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
#driver path
driver = webdriver.Firefox(executable_path="D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
)
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
print(nadpist)
##find the price and print
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
print(cenat)
##export to csv - not working
time.sleep(1)
print("Writing to csv")
d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
i want the csv to look like:
name,price
name2, price2
it would be great is the csv had only two columns and x rows depending on the number of listings
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = []
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
d.append({"Nazov": vypis.find_element_by_class_name("nadpis").text,
"Cena": vypis.find_element_by_class_name("cena").text
})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv",index=False)
this gives me 59 items with price. first added to dict then to list, then send that to pandas.
All you need to do is create two empty lists nadpist_l, cenat_l and append data to that lists, finally save the lists as a dataframe.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = {}
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
nadpist_l = []
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
nadpist_l.append(nadpist)
# print(nadpist)
##find the price and print
cenat_l = []
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
cenat_l.append(cenat)
print(len(cenat_l))
##export to csv - not working
d.update({"Nazov": [nadpist_l] * 20 * x, "Cena": [cenat_l] * 20 * x})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")

How to extract all dynamic table data from a website using selenium?

I am new to web scraping. I am trying to extract table data from the Forbes Top Multinational Performers list. I was able to successfully extract some data. However, I only was able to get the top 10 from the list. The table contains ads in between. How can I get all the data?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 30)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH,
'.//*[#id="the_list"]/tbody[#id="list-table-body"]')))
data = []
for row in rows:
for i in row.find_elements_by_class_name("data"):
try:
if i.is_displayed():
row_dict = {}
row_dict['Rank'] = i.find_element_by_xpath('.//td[2]').text
row_dict['Link'] = i.find_element_by_xpath('.//td[3]/a[#href]').get_attribute("href")
row_dict['Company'] = i.find_element_by_xpath('.//td[3]').text
row_dict['Industry'] = i.find_element_by_xpath('.//td[4]').text
row_dict['Country'] = i.find_element_by_xpath('.//td[5]').text
data.append(row_dict)
except:
continue
driver.close()
df = pd.DataFrame(data)
df.to_csv("Forbes_TEST.csv", sep=",", index=False)
To get all 250 records you just need to add code to scroll to the bottom of the page to your existing code. So add:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
before:
data = []
and add import time
But saying that your code is really slow. Even with setting your wait_row to 3 it took 1m5.933s to run on my machine. The following code took 0m12.978s to run.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import csv
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 3)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH, './/*[#id="the_list"]/tbody[#id="list-table-body"]')))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
ranks = []
links = []
companies = []
industries = []
countries = []
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table", {"id": "the_list"})
for tr in table.find_all("tr", {"class": "data"}):
tds = tr.find_all("td")
ranks.append(tds[1].text)
links.append(tds[2].find('a')['href'])
companies.append(tds[2].text)
industries.append(tds[3].text)
countries.append(tds[4].text)
data = zip(ranks, links, companies, industries, countries)
with open('Forbes_TEST_02.csv', 'w') as csvfile:
csv_out = csv.writer(csvfile)
csv_out.writerow(['Rank', 'Link', 'Company','Industry', 'Country'])
csv_out.writerows(data)
driver.close()

Selenium isn't waiting to click button before printing results

I'm trying to scrape results from Google Flights, using BeautifulSoup, after hitting a button that shows the full list of flights. I added an explicit wait, but for some reason this isn't working.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome(executable_path="/Users/Shiva/Downloads/chromedriver")
driver.get('https://www.google.com/flights?hl=en#flt=/m/03v_5.IAD.2019-02-10*IAD./m/03v_5.2019-02-11;c:USD;e:1;sd:1;t:f')
xpath = '//*[#id="flt-app"]/div[2]/main[4]/div[7]/div[1]/div[3]/div[4]/div[5]/div[1]/div[3]/jsl/a[1]/span[1]/span[2]'
wait = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,xpath)))
load_all_flights = driver.find_element_by_xpath(xpath)
load_all_flights.click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
info = soup.find_all('div', class_="gws-flights-results__collapsed-itinerary gws-flights-results__itinerary")
for trip in info:
price = trip.find('div', class_="flt-subhead1 gws-flights-results__price gws-flights-results__cheapest-price")
if price == None:
price = trip.find('div', class_="flt-subhead1 gws-flights-results__price")
type_of_flight = trip.find('div', class_="gws-flights-results__stops flt-subhead1Normal gws-flights-results__has-warning-icon")
if type_of_flight == None:
type_of_flight = trip.find('div', class_="gws-flights-results__stops flt-subhead1Normal")
print(str(type_of_flight.text).strip() + " : " + str(price.text).strip())

dynamic values web scraping

Hello guys I've been trying to web scrape some pages that contain values that change all the time, but I'm not able to get the prices so far. Can anybody help me, this is where I reached so far!
import requests
import bs4
from urllib.request import Request, urlopen as uReq
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
browser = webdriver.Firefox(firefox_binary=binary)
browser.get(my_url)
html = browser.execute_script("return document.documentElement.outerHTML")
sel_soup = soup(html, 'html.parser')
prices = sel_soup.findAll("td", {"class":"price"})
print(prices)
You can try below code to get currency names, prices
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
driver.get(my_url)
names = [name.text.split('\n')[0] for name in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'desktop-name')))]
prices = [price.text for price in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'current-price-value')))]
print(datetime.datetime.now())
for name, price in zip(names, prices):
print(name + " - " + price)
In case, if you want all the 10 prices. you'd have to store all the prices in a list, like this :
all_prices = driver.find_elements_by_css_selector("td[class='price'] div")
then just iterate through a loop to get the values :
for price in all_prices:
print(price.text)
let me know, if you are facing any difficulties.
If you want to use BS and not Selenium Webdriver:
prices = sel_soup.select("td[class^='price'] > div")

Categories

Resources