Unable to store scraped data into an Excel file - Python

Hi guys, I was scraping this data and my code is working fine, but I cannot understand how to store the scraped data into an Excel file, and I am getting more confused after looking at other solutions and answers.
Here is my code:
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
import openpyxl
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

##### Web scraper for infinite scrolling page #####
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get("https://www.zomato.com/ncr/south-delhi-restaurants/fast-food?rating_range=4.0-5.0&category=2")
time.sleep(10)  # Allow 10 seconds for the web page to open
scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
i = 1
count = 0
while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break

page_sources = BeautifulSoup(driver.page_source, "html.parser")
title = driver.find_elements_by_xpath("//a[@class='sc-jHZirH intUsQ']")
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    address = i.find_element_by_xpath('./p[1]')
    #data = openpyxl.load_workbook('Bakery.xlsx')
    df = pd.DataFrame({"Bakery Restaurants": name, "Address": address})
    print(count)
driver.close()
This is the main block of code:
page_sources = BeautifulSoup(driver.page_source, "html.parser")
title = driver.find_elements_by_xpath("//a[@class='sc-jHZirH intUsQ']")
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    address = i.find_element_by_xpath('./p[1]')
    #data = openpyxl.load_workbook('Bakery.xlsx')
    df = pd.DataFrame({"Bakery Restaurants": name, "Address": address})
    print(count)
driver.close()
Please help me out, I am very confused. Any suggestion or solution will be appreciated.

Assuming your scraping is working correctly, you need to store the name and address in the Excel file:
# scraping code
#
#
names = []      # Create lists to hold your columns
addresses = []  # Create lists to hold your columns
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    names.append(name)
    address = i.find_element_by_xpath('./p[1]')
    addresses.append(address)

df = pd.DataFrame({'Names': names, 'Addresses': addresses})  # Create a DF with the lists
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
Note: make sure the datatype of the scraped values is a string, or another type supported by pandas and Excel, and not a Python object such as a WebElement.
Docs: Insert into DataFrame
DataFrame to Excel

This is my attempt and it is working fine:
names = []
addresses = []
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4').text
    address = i.find_element_by_xpath('./p[1]').text
    names.append(name)
    addresses.append(address)

df = pd.DataFrame({'Names': names, 'Addresses': addresses})
writer = pd.ExcelWriter('pandas_simple.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
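A small note on the last two lines: pandas deprecated ExcelWriter.save() (it was removed in pandas 2.0 in favour of closing the writer), so on a recent pandas version this script may warn or fail at the end. A minimal sketch of the same write using the context-manager form instead, assuming openpyxl is installed and df is built as above:
with pd.ExcelWriter('pandas_simple.xlsx', engine='openpyxl') as writer:
    # the file is saved and closed automatically when the block exits
    df.to_excel(writer, sheet_name='Sheet1', index=False)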

Related

Want to scrape titles, dates, links, and content from the IOL website but can't

I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tags. However, when I tried to store them in a pandas DataFrame, I received the following error: ValueError: All arrays must be of the same length.
I also wrote code to get the content of each article using the links, and I got an error there as well. I will be thankful if I can be assisted.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import timedelta
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re

art_title = []  # to store the titles of all news articles
art_date = []   # to store the dates of all news articles
art_link = []   # to store the links of all news articles

pagesToGet = ['south-africa/eastern-cape']

for i in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.iol.co.za' + str(pagesToGet[i])
    print(url)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    #time.sleep(5) # allow you to sleep your code before you retrieve the elements from the webpage. Additionally, to
    # prevent the chrome driver opening a new instance for every url, open the browser outside of the loop.
    # an exception might be thrown, so the code should be in a try-except block
    try:
        # use the browser to get the url. This is a suspicious command that might blow up.
        driver.get("https://www.iol.co.za/news/" + str(pagesToGet[i]))
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and line that threw the exception
        continue  # ignore this page. Abandon this and go back.
    time.sleep(3)  # Allow 3 seconds for the web page to open

    # Code to scroll the screen to the end and click on more news till the 15th page before scraping all the news
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
        i = 1
        while True:
            # scroll one screen height each time
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if (screen_height) * i > scroll_height:
                break
        driver.find_element(By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1
        time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))

    # Getting titles, dates, and links
    for j in news:
        # Article title
        title = j.findAll(re.compile('^h[1-6]'))
        for news_title in title:
            art_title.append(news_title.text)
        # Article dates
        dates = j.find('p', attrs={'class': 'sc-cIShpX'})
        if dates is not None:
            date = dates.text
            split_date = date.rsplit('|', 1)[1][10:].rsplit('<', 1)[0]
            art_date.append(split_date)
        # Article links
        address = j.find('a').get('href')
        news_link = 'https://www.iol.co.za' + address
        art_link.append(news_link)

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})

# Getting contents
new_articles = ...struggling to write the code
df['Content'] = news_articles
df.to_csv('data.csv')
driver.quit()
I think this is what you are looking for:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

# For every article we take what we want
for article in articles:
    header = article.find_element(By.XPATH, f".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']")
    print(header.get_attribute('textContent'))
    author_and_date = article.find_elements(By.XPATH, f".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, f".//a")
    print(link.get_attribute('href'))
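The ValueError from the question ("All arrays must be of the same length") happens because art_title collects every heading tag while art_date and art_link get at most one entry per article, so the three lists end up with different lengths. If you also want the DataFrame and CSV the question asks for, here is a minimal sketch reusing the locators from the answer above and collecting exactly one title, date and link per article so the lists stay aligned (an illustration, not the only way to do it):
import pandas as pd

art_title, art_date, art_link = [], [], []
for article in articles:
    # one heading per article keeps the three lists the same length
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']")
    art_title.append(header.get_attribute('textContent').strip())
    # the first following <p> holds the author/date text; use '' when it is missing
    date_els = article.find_elements(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']/following-sibling::p[1]")
    art_date.append(date_els[0].get_attribute('textContent').strip() if date_els else '')
    art_link.append(article.find_element(By.XPATH, ".//a").get_attribute('href'))

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})
df.to_csv('data.csv', index=False)
Fetching the full article content would then be a second loop that calls driver.get(link) for each stored URL; the selectors for the article body are site-specific and are not shown here.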

How to extract data in the right order with Beautiful Soup

I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
I am using Selenium to click on the "Expand All" button before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated, but my question is not about that now.
Below is how the code currently looks:
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()

    # content of the whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)

    # the following code returns an empty list for index (why?)
    # and values in a list that need actually be in a DataFrame
    idx = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        for h in div.find_all('title'):
            text = h.get_text()
            idx.append(text)
    val = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
        for h in div.find_all('span'):
            num = int(h.get_text().replace(",", "")) * 1000
            val.append(num)

    # if the above part is commented out and this block is used instead
    # the following code manages to work well until the row "Cash Equivalents"
    # that is because there are no entries for years 2020 and 2019 on this row
    """ for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        i = 0
        idx = ""
        val = []
        for h in div.find_all('span'):
            if i % 5 == 0:
                idx = h.get_text()
            else:
                num = int(h.get_text().replace(",", "")) * 1000
                val.append(num)
            i += 1
        row = pd.DataFrame([val], columns=col, index=[idx])
        df = pd.concat([df, row], axis=0) """

    return idx, val

get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object whose index is the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with the first column serving as the index.
[snapshot: balance-sheet-df]
I've spent a long time on this, I hope it helps. Basically, your function now returns a DataFrame with the following formatting:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
And here is the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()

    # content of the whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))

    row = {}
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        head = div.find('span').get_text()
        i = 4
        for h in div.find_all('span'):
            if h.get_text().replace(',', '').isdigit() or h.get_text()[0] == '-':
                row[head].append(h.get_text())
                i += 1
            else:
                while i < 4:
                    row[head].append('')
                    i += 1
                else:
                    head = h.get_text()
                    row[head] = []
                    i = 0

    for k, v in row.items():
        while len(v) < 4:
            row[k].append('-')

    df = pd.DataFrame(columns=col, index=row.keys(), data=row.values())
    print(df)
    return df

get_balance_sheet_from_yfinance("MSFT")
I've removed some of the unused code and added a new scraping method, but I have kept your method of getting the dates of all the columns.
If you have any questions, don't hesitate to ask in the comments.
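One optional extra, not part of the answer above: the cells of the returned DataFrame are still strings with thousands separators, with '-' or '' marking missing values. If you want numbers, or want to save the table like in the first question on this page, a small post-processing sketch (assuming openpyxl is installed for the Excel export):
df = get_balance_sheet_from_yfinance("MSFT")
# strip the thousands separators and coerce the '-' / '' placeholders to NaN
df = df.apply(lambda s: pd.to_numeric(s.str.replace(',', ''), errors='coerce'))
df.to_excel("MSFT_balance_sheet.xlsx")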

How to use driver.get(url) with an href list extracted from the page?

I want to go to https://www.bookmaker.com.au/sports/soccer and extract the soccer URLs, which the code does. I then want to visit each of those webpages through driver.get(url), extract the data for each of those URLs, and place it in pandas. I am stuck at calling driver.get(url) for each of the extracted links. Any help appreciated.
CSS selector for the links passed to driver.get(url):
url = a[class *= 'matches-filter__region']
import time
import pandas as pd
import webdriver_manager.chrome
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
###########################################################################################################################################################
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_experimental_option("detach", True)
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(service=Service(webdriver_manager.chrome.ChromeDriverManager().install()), options=options)
driver.get('https://www.bookmaker.com.au/sports/soccer')
aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")))
################################################################################################################
for url in aa:
    aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
    driver.get(aa)
##############################################################################
#Full Code https://pastebin.com/W0VqaKVD
After using a VPN I could connect, and I found a few problems with the code:
You have to get aa before the for-loop.
find_elements() gives references to objects in the browser's memory, but when you use get() it removes these objects from memory to create new objects for the new page, so the results from find_elements() become useless (stale). You have to use .get_attribute('href') to get the URLs as strings.
You have to use a for-loop to run get() for every string from the list, and you have to run the other code inside this loop. After the loop you can create the DataFrame.
This is the code without the scraping code inside the loop, but at least it visits all the URLs.
# --- before loop ---
all_a = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")

# get URLs as strings
all_urls = []
for item in all_a:
    all_urls.append(item.get_attribute('href'))
# shorter
#all_urls = [item.get_attribute('href') for item in all_a]

team1List = []
backOddsList = []
team2List = []
layOddsList = []

# --- loop ---
# visit all pages and get teams
for url in all_urls:
    print(url)
    driver.get(url)
    # here (inside loop) all code to get teams from active page

# --- after loop ---
df = pd.DataFrame({
    'Team1': team1List,
    'Back Odds': backOddsList,
    'Team2': team2List,
    'Lay Odds': layOddsList
})

df.to_excel('bookmaker.xlsx', engine='openpyxl', sheet_name='Sheet_name_1', index=False)
I updated the code and checked it; it is working, navigating to all 15 URLs in the same browser window:
diff_country_urls = []
for i in range(len(aa)):
    diff_country_urls.append(aa[i].get_attribute("href"))

for url in diff_country_urls:
    driver.get(url)

Scraping with bs4 and selenium, each loop returns the same data

I'm pretty new to web scraping and am trying to scrape backdated data from timeanddate.com and output it to a csv. I'm using Selenium to get the data table for each date.
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
import csv

def getData(url, month, year):
    driver = webdriver.Chrome('C:/Users/adam/Desktop/chromedriver.exe')
    driver.get(url)
    Data = []
    soup = BeautifulSoup(driver.page_source, "lxml")
    for i in driver.find_element_by_id("wt-his-select").find_elements_by_tag_name("option"):
        i.click()
        table = soup.find('table', attrs={'id': 'wt-his'})
        for tr in table.find('tbody').find_all('tr'):
            dict = {}
            dict['time'] = tr.find('th').text.strip()
            all_td = tr.find_all('td')
            dict['humidity'] = all_td[5].text
            Data.append(dict)
    fileName = "output_month=" + month + "_year=" + year + ".csv"
    keys = Data[0].keys()
    with open(fileName, 'w') as result:
        dictWriter = csv.DictWriter(result, keys)
        dictWriter.writeheader()
        dictWriter.writerows(Data)

year_num = int(input("Enter your year to collect data from: "))
month_num = 1
year = str(year_num)
for i in range(0, 12):
    month = str(month_num)
    url = "https://www.timeanddate.com/weather/usa/new-york/historic?month=" + month + "&year=" + year
    data = getData(url, month, year)
    print(data)
    month_num += 1
The table I'm trying to scrape data from is weather data and I want to get the humidity data from each day in the month.
The program cycles through the months but the output is the data for Mon, 1 Jan. Although the date changes in-browser, the same data is appended to the file each time (current output) rather than each new day being appended (desired output). I can't work out why it does this and any help fixing it would be much appreciated.
The problem is that you parse the website only once even though the site changes with each date selection. However, it is not enough to move parsing inside the for loop as it is also necessary to wait until the page is loaded before starting re-parsing.
Below is my solution. There are two things to note:
I am making use of WebDriverWait and expected_conditions, which come built in with Selenium.
I prefer finding by CSS selectors, which greatly simplifies the syntax. This awesome game can help you learn them.
# Necessary imports
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def getData(url, month, year):
    driver = webdriver.Chrome('C:/Users/adam/Desktop/chromedriver.exe')
    driver.get(url)
    wait = WebDriverWait(driver, 5)
    Data = []
    for opt in driver.find_elements_by_css_selector("#wt-his-select option"):
        opt.click()
        # wait until the table title changes to the selected date
        wait.until(EC.text_to_be_present_in_element((By.ID, 'wt-his-title'), opt.text))
        for tr in driver.find_elements_by_css_selector('#wt-his tbody tr'):
            dict = {}
            dict['time'] = tr.find_element_by_tag_name('th').text.strip()
            # Note that I replaced 5 with 6 as nth-of-xxx starts indexing from 1
            dict['humidity'] = tr.find_element_by_css_selector('td:nth-of-type(6)').text.strip()
            Data.append(dict)
    # continue with csv handlers ...
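For completeness, the CSV-writing step can be reused almost unchanged from the question; a minimal sketch of that last part, placed inside getData after the loop (it assumes the csv import and the month/year arguments from the original code):
    # write the collected rows to a per-month CSV, as in the original code
    fileName = "output_month=" + month + "_year=" + year + ".csv"
    keys = Data[0].keys()
    with open(fileName, 'w', newline='') as result:
        dictWriter = csv.DictWriter(result, keys)
        dictWriter.writeheader()
        dictWriter.writerows(Data)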

Extract data from site and different pages and save to csv

I am working with selenium webdriver to search and extract data from this site https://www.idealista.com/venta-viviendas/marbella-malaga/
I want to get a table with the price of each of the houses (class item_price), the number of rooms (class item_detail) and the sq. meters (class item_detail).
I believe I have to use the driver.find_elements() method, but I don't know where to add it and how to make sure we add all prices, rooms and sq. meters in a table with three columns.
I have this code so far, but it doesn't work. I see Firefox going through the pages, but it seems it doesn't keep and store the data in houses.csv. Can anyone help? Thanks.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import csv
import time

driver = webdriver.Firefox()
driver.get("https://www.idealista.com/venta-viviendas/marbella/las-chapas-el-rosario/")
pages_remaining = True
price = []
rooms = []
size = []
while pages_remaining:
    price = driver.find_elements_by_class_name("item-price")
    rooms = driver.find_elements_by_xpath("//*[contains(text(), 'hab.')]")
    size = driver.find_elements_by_xpath("//*[contains(text(), 'm²')]")
    houses = [price, rooms, size]
    try:
        # Checks if there are more pages with links
        next_link = driver.find_element_by_class_name("icon-arrow-right-after")
        next_link.click()
        time.sleep(10)
    except NoSuchElementException:
        rows_remaining = False
    with open('houses.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(houses)
        print(houses)
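This question has no answer in the thread, but based on the classes and texts mentioned above, here is a minimal sketch of one way to do it: take the .text of each element so strings are stored instead of WebElements, accumulate them across pages, and write the three columns once at the end. The locators (item-price, the 'hab.' and 'm²' texts, icon-arrow-right-after) are taken from the question and may have changed on the live site, and note that the loop must reset the same pages_remaining flag it tests:
prices, rooms_list, sizes = [], [], []
pages_remaining = True
while pages_remaining:
    # collect the text of the listings on the current page
    for el in driver.find_elements_by_class_name("item-price"):
        prices.append(el.text)
    for el in driver.find_elements_by_xpath("//*[contains(text(), 'hab.')]"):
        rooms_list.append(el.text)
    for el in driver.find_elements_by_xpath("//*[contains(text(), 'm²')]"):
        sizes.append(el.text)
    try:
        driver.find_element_by_class_name("icon-arrow-right-after").click()
        time.sleep(10)
    except NoSuchElementException:
        pages_remaining = False  # stop the loop instead of setting a different variable

# the lists can differ in length if a listing lacks a field, so trim to the shortest
n = min(len(prices), len(rooms_list), len(sizes))
df = pd.DataFrame({'Price': prices[:n], 'Rooms': rooms_list[:n], 'Size': sizes[:n]})
df.to_csv('houses.csv', index=False)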
