I'm trying to scrape the next day's forecast for time, wind speed and wind direction from Weather Underground. I adapted the code in this tutorial and my MWE is
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
# define future date
start_date = date.today() + pd.Timedelta(days=1)
# get data for Sydney
page = 'https://www.wunderground.com/hourly/au/sydney/date/{}-{}-{}'
df = pd.DataFrame()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.set_capability("loggingPrefs", {'performance': 'ALL'})
service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
classlist = ["mat-cell cdk-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",]
name = ['time', 'windspeed_mph', 'winddirection']
print('gathering data from: ', start_date)
formatted_lookup_URL = page.format(start_date.year,
start_date.month,
start_date.day)
driver.get(formatted_lookup_URL)
rows = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located((By.XPATH,
        '//td[@class="' + classlist[0] + '"]')))
for row in rows:
    time = row.find_element(By.XPATH, './/span[@class="ng-star-inserted"]').text
    # append new row to table
    df = df.append(pd.DataFrame({"Day": [str(start_date)], "time": [time]}),
                   ignore_index=True)
del classlist[0]
for ii in range(len(classlist)):
    rows = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH,
            '//td[@class="' + classlist[ii] + '"]')))
    for row in rows:
        if name[ii] == 'winddirection':
            data = row.find_element(By.XPATH,
                                    './/span[@class="wu-suffix ng-star-inserted"]').text
            print(data)
        else:
            data = row.find_element(By.XPATH,
                                    './/span[@class="wu-value wu-value-to"]').text
        # append new row to table
        df = df.append(pd.DataFrame({name[ii]: [data]}), ignore_index=True)
driver.quit()
# remove NaN
df = df.apply(lambda x: pd.Series(x.dropna().values))
print(df.head())
The final dataframe df contains the time and wind speed, but not the wind direction. I suspect the problem is the line data = row.find_element(By.XPATH, './/span[@class="wu-suffix ng-star-inserted"]').text, but I'm not sure how to fix it.
It seems that classlist and name have different lengths after the line del classlist[0]. Fix it by adding this line right after you delete the first element of classlist:
del name[0]
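For reference, here is a minimal sketch of the same extraction loop with classlist and name zipped together, so the two always stay the same length and no del bookkeeping is needed (it also uses pd.concat, since DataFrame.append was removed in pandas 2.0; driver, df, classlist and name are the variables defined above):
# Sketch: iterate the classes and column names in lockstep instead of deleting elements
for cls, col in zip(classlist[1:], name[1:]):
    rows = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, f'//td[@class="{cls}"]')))
    for row in rows:
        if col == 'winddirection':
            data = row.find_element(By.XPATH, './/span[@class="wu-suffix ng-star-inserted"]').text
        else:
            data = row.find_element(By.XPATH, './/span[@class="wu-value wu-value-to"]').text
        # concatenate a one-row frame instead of the removed DataFrame.append
        df = pd.concat([df, pd.DataFrame({col: [data]})], ignore_index=True)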
I'm trying to update each row with the title of each link in the loop; however, only one value is being populated in the worksheet. I would like something like a list:
Women's Walking Shoes Sock Sneakers
Smart Watch for MenWomen
....
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import gspread
import time
import datetime
import undetected_chromedriver as uc  # needed for uc.Chrome below
driver = uc.Chrome(use_subprocess=True)
asins = ['B07QJ839WH', 'B00GP258C2']
for asin in asins:
    r = driver.get(f'https://www.amazon.com/dp/{asin}')
    time.sleep(20)
    try:
        title = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//span[@id="productTitle"]')))
        titleValue = title.get_attribute("innerText")
    except:
        titleValue = ('Title not found')
    asin = asin
    date = datetime.datetime.today()
    print(titleValue, asin, date)
gc = gspread.service_account(filename='creds.json')
sh = gc.open('Luciana-2022')
oos = sh.worksheet("OOS")
cell_list = oos.range('A2:A')
cell_values = [titleValue]
for i, val in enumerate(cell_values):
    cell_list[i].value = val
oos.update_cells(cell_list)
Hi, your issue is located here: for i, val in enumerate(cell_values):
The variable cell_values contains only one value, so when you iterate over it the loop only runs once.
If you wish to go over the complete list of cells and update their values, simply do:
cell_list = oos.range('A2:A')
for i in range(len(cell_list)):
    cell_list[i].value = titleValue
oos.update_cells(cell_list)
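If the goal is actually one worksheet row per ASIN rather than filling a whole range with the same value, another option, sketched here under the assumption that gc, sh and oos are opened once before the loop exactly as in the question, is to append a row per iteration:
gc = gspread.service_account(filename='creds.json')
sh = gc.open('Luciana-2022')
oos = sh.worksheet("OOS")
for asin in asins:
    driver.get(f'https://www.amazon.com/dp/{asin}')
    try:
        title = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//span[@id="productTitle"]')))
        titleValue = title.get_attribute("innerText")
    except Exception:
        titleValue = 'Title not found'
    # append_row writes after the last non-empty row, so every ASIN gets its own line
    oos.append_row([titleValue, asin, str(datetime.datetime.today())])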
I am trying to use selenium in Python to take information from a city's public website and loop through information I have in a csv with each row being a different address, date and city. Ideally I would download the associated PDF, but I am getting stuck on how to actively loop through the csv using pandas. I pasted the code I have so far!
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
df = pd.read_csv(r'C:\\Users\\xxx\\Desktop\\uof_ex.csv')
driver = webdriver.Chrome('C:\\Users\\xxx\\Desktop\\chromedriver_win32\\chromedriver.exe')
driver.get('https://p2c.highpointnc.gov/EventSearch')
wait = WebDriverWait(driver, 20)
wait_implicit = driver.implicitly_wait(5)
action = ActionChains(driver)
pop_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="disclaimerDialog"]/md-dialog-actions/button[2]'))).click()
i = 0
while i == 0:
    a = 0
    address = df.address
    city = df.City
    date = df.date_occu
    search_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="byReportInformation-card"]/md-card-title/md-card-title-text/span[1]'))).click()
    time.sleep(2)
    address_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="address-input"]')))
    address_element.click()
    address_element.clear()
    address_element.send_keys(address[a])
    time.sleep(2)
    city_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="city-select"]'))).click()
    city_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="select_option_24"]'))).click()
    time.sleep(2)
    stdate_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).click()
    stdate_element_clear = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).clear()
    time.sleep(2)
    enddate_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).click()
    enddate_element_clear = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).clear()
    time.sleep(2)
    stdate_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).send_keys(date[a])
    enddate_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).send_keys(date[a])
    search_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="search-button"]'))).click()
    time.sleep(2)
    back_element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="back-button"]'))).click()
    time.sleep(2)
    a = a + 1
You can loop over the rows of the pandas DataFrame returned by read_csv, for example as named tuples:
df = pd.read_csv(r'C:\\Users\\xxx\\Desktop\\uof_ex.csv')
for row in df.itertuples():
    address = row.address
    city = row.City
    date = row.date_occu
Now address contains the data from the currently iterated row, so you can use address directly instead of address[a].
Or as dictionaries:
for index, row in df.iterrows():
    address = row['address']
    city = row['City']
    date = row['date_occu']
read here for more options https://www.dataindependent.com/pandas/pandas-iterate-over-rows/
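As a self-contained illustration of the two access styles (using a tiny made-up frame rather than the real CSV):
import pandas as pd

# hypothetical one-row frame with the same column names as the question's CSV
demo = pd.DataFrame({'address': ['1 Main St'], 'City': ['High Point'], 'date_occu': ['01/02/2022']})
# named-tuple style: attribute access
for row in demo.itertuples(index=False):
    print(row.address, row.City, row.date_occu)
# dictionary style: key access
for index, row in demo.iterrows():
    print(row['address'], row['City'], row['date_occu'])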
This is the code that I am working on, and I want the code to iterate through the scraping and the dates in such a way that it does not overwrite the CSV file, with the new data added after scraping through the next set of dates. Please help me with it.
I tried different methods such as f.write() and append as well for the same code, but it gives me some error or other.
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from functools import reduce
import pandas as pd
import time
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import requests
def render_page(url, type):
    driver = webdriver.Chrome(executable_path=r'C:\Users\abc\Desktop\screen scraping codes\chromedriver.exe')
    driver.get(url)
    time.sleep(15)
    if type == "C":
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'wuSettings'))
        )
        element.click()
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="wuSettings-quick"]/div/a[2]')))
        element.click()
        time.sleep(15)
        r = driver.page_source
        driver.quit()
    if type == "F":
        r = driver.page_source
        driver.quit()
    return r

def hourly_scraper(page, dates, type):
    output = pd.DataFrame()
    for d in dates:
        url = str(str(page) + str(d))
        r = render_page(url, type)
        soup = BS(r, "html.parser",)
        container = soup.find('lib-city-history-observation')
        check = container.find('tbody')
        data = []
        data_hour = []
        for i in check.find_all('span', class_='ng-star-inserted'):
            trial = i.get_text()
            data_hour.append(trial)
        for i in check.find_all('span', class_='wu-value wu-value-to'):
            trial = i.get_text()
            data.append(trial)
        numbers = pd.DataFrame([data[i:i+7] for i in range(0, len(data), 7)], columns=["Temperature", "Dew Point", "Humidity", "Wind Speed", "Wind Gust", "Pressure", "Precipitation"])
        hour = pd.DataFrame(data_hour[0::17], columns=["Time"])
        wind = pd.DataFrame(data_hour[7::17], columns=["Wind"])
        condition = pd.DataFrame(data_hour[16::17], columns=["Condition"])
        dfs = [hour, numbers, wind, condition]
        df_final = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), dfs)
        df_final['Date'] = str(d)
        output = output.append(df_final)
        df_final.to_csv(r'C:\Users\abc\Desktop\screen scraping codes\123.csv', index=False)
        print(str(str(d) + ' finished!'))
    return output
page = "https://www.wunderground.com/history/daily/in/ahmedabad/VAAH/date/"
#dates = ["2020-12-27","2020-12-28"]
d0 = datetime(2009, 1,1)
d1 = datetime(2009, 1,3)
dt = timedelta(days = 1)
dates = np.arange(d0, d1, dt).astype(datetime)
hourly = hourly_scraper(page,dates,"C")
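For what it's worth, one common way to keep adding to the same file instead of replacing it (just a sketch, not from the original post) is to open the CSV in append mode and write the header only when the file does not exist yet:
import os
import pandas as pd

def append_day(df_final, csv_path):
    # write the header only when the file is being created for the first time
    write_header = not os.path.exists(csv_path)
    df_final.to_csv(csv_path, mode='a', header=write_header, index=False)
Calling append_day(df_final, r'C:\Users\abc\Desktop\screen scraping codes\123.csv') inside the dates loop, in place of the plain df_final.to_csv(...) line, would append each day's rows rather than overwrite the file.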
So I am working on a custom web scraper for any kind of ecommerce site. I want it to scrape the names and prices of listings on a site and then export them to CSV, but the problem is it exports only one (name, price) pair and prints it on every line of the CSV. I couldn't find a good solution for this; I hope I'm not asking an extremely stupid thing, although I think the fix is easy. I hope someone will read my code and help me, thank you!
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
#driver path
driver = webdriver.Firefox(executable_path="D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
        )
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")
            ##print listings to check correctness
            nadpist = nadpis.text
            print(nadpist)
        ##find the price and print
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            cenat = cena.text
            print(cenat)
        ##export to csv - not working
        time.sleep(1)
        print("Writing to csv")
        d = {"Nazov": [nadpist]*20*x, "Cena": [cenat]*20*x}
        df = pd.DataFrame(data=d)
        df.to_csv("bobo.csv")
        time.sleep(1)
        print("Writing to csv done !")
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
I want the CSV to look like:
name,price
name2, price2
It would be great if the CSV had only two columns and x rows, depending on the number of listings.
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = []
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            d.append({"Nazov": vypis.find_element_by_class_name("nadpis").text,
                      "Cena": vypis.find_element_by_class_name("cena").text
                      })
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv", index=False)
This gives me 59 items with prices: the data is first added to a dict, then to a list, and that list is then sent to pandas.
All you need to do is create two empty lists, nadpist_l and cenat_l, append the data to those lists, and finally save the lists as a dataframe.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = {}
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        nadpist_l = []
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")
            ##print listings to check correctness
            nadpist = nadpis.text
            nadpist_l.append(nadpist)
            # print(nadpist)
        ##find the price and print
        cenat_l = []
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            cenat = cena.text
            cenat_l.append(cenat)
        print(len(cenat_l))
        ##export to csv
        d.update({"Nazov": nadpist_l, "Cena": cenat_l})
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
I am new to web scraping. I am trying to extract table data from the Forbes Top Multinational Performers list. I was able to successfully extract some data. However, I was only able to get the top 10 from the list. The table contains ads in between. How can I get all the data?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 30)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH,
    './/*[@id="the_list"]/tbody[@id="list-table-body"]')))
data = []
for row in rows:
    for i in row.find_elements_by_class_name("data"):
        try:
            if i.is_displayed():
                row_dict = {}
                row_dict['Rank'] = i.find_element_by_xpath('.//td[2]').text
                row_dict['Link'] = i.find_element_by_xpath('.//td[3]/a[@href]').get_attribute("href")
                row_dict['Company'] = i.find_element_by_xpath('.//td[3]').text
                row_dict['Industry'] = i.find_element_by_xpath('.//td[4]').text
                row_dict['Country'] = i.find_element_by_xpath('.//td[5]').text
                data.append(row_dict)
        except:
            continue
driver.close()
df = pd.DataFrame(data)
df.to_csv("Forbes_TEST.csv", sep=",", index=False)
To get all 250 records you just need to add code to your existing script that scrolls to the bottom of the page. So add:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
before:
data = []
and add import time
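If a single scroll ever stops being enough (for example if the rows load lazily in chunks), a common variant, shown here only as a sketch, is to keep scrolling until the page height stops changing:
# Sketch: scroll repeatedly until document.body.scrollHeight stops growing
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height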
That said, your code is really slow. Even with your wait_row set to 3 it took 1m5.933s to run on my machine. The following code took 0m12.978s to run.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import csv
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 3)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH, './/*[@id="the_list"]/tbody[@id="list-table-body"]')))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
ranks = []
links = []
companies = []
industries = []
countries = []
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table", {"id": "the_list"})
for tr in table.find_all("tr", {"class": "data"}):
    tds = tr.find_all("td")
    ranks.append(tds[1].text)
    links.append(tds[2].find('a')['href'])
    companies.append(tds[2].text)
    industries.append(tds[3].text)
    countries.append(tds[4].text)
data = zip(ranks, links, companies, industries, countries)
with open('Forbes_TEST_02.csv', 'w') as csvfile:
    csv_out = csv.writer(csvfile)
    csv_out.writerow(['Rank', 'Link', 'Company', 'Industry', 'Country'])
    csv_out.writerows(data)
driver.close()