Extract data from a site and its different pages and save to CSV - Python

I am working with Selenium WebDriver to search and extract data from this site: https://www.idealista.com/venta-viviendas/marbella-malaga/
I want to get a table with the price of each of the houses (class item_price), the number of rooms (class item_detail) and the sq. meters (class item_detail).
I believe I have to use the driver.find_elements() method, but I don't know where to add it or how to make sure all prices, rooms and sq. meters end up in a table with three columns.
This is the code I have so far, but it doesn't work. I can see Firefox going through the pages, but the data never gets stored in houses.csv. Can anyone help? Thanks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import csv
import time

driver = webdriver.Firefox()
driver.get("https://www.idealista.com/venta-viviendas/marbella/las-chapas-el-rosario/")

pages_remaining = True
price = []
rooms = []
size = []

while pages_remaining:
    price = driver.find_elements_by_class_name("item-price")
    rooms = driver.find_elements_by_xpath("//*[contains(text(), 'hab.')]")
    size = driver.find_elements_by_xpath("//*[contains(text(), 'm²')]")
    houses = [price, rooms, size]
    try:
        # Checks if there are more pages with links
        next_link = driver.find_element_by_class_name("icon-arrow-right-after")
        next_link.click()
        time.sleep(10)
    except NoSuchElementException:
        rows_remaining = False

with open('houses.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(houses)

print(houses)
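There is no accepted answer here, but one way to restructure the loop is to collect one row per listing on every page and write a single CSV once all pages have been visited. The sketch below assumes each listing card is an article.item element carrying the item-price and item-detail classes mentioned above; the selectors are assumptions and may need adjusting.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import csv
import time

driver = webdriver.Firefox()
driver.get("https://www.idealista.com/venta-viviendas/marbella-malaga/")

rows = []  # one (price, rooms, size) tuple per listing
while True:
    # each listing card is assumed to be an 'article.item' element
    for card in driver.find_elements(By.CSS_SELECTOR, "article.item"):
        price = card.find_element(By.CLASS_NAME, "item-price").text
        details = [d.text for d in card.find_elements(By.CLASS_NAME, "item-detail")]
        rooms = next((d for d in details if "hab." in d), "")
        size = next((d for d in details if "m²" in d), "")
        rows.append((price, rooms, size))
    try:
        driver.find_element(By.CLASS_NAME, "icon-arrow-right-after").click()
        time.sleep(10)
    except NoSuchElementException:
        break  # no "next" arrow: last page reached

# write everything once, after all pages have been visited
with open("houses.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["price", "rooms", "size"])
    writer.writerows(rows)

driver.quit()

The key difference from the code above is that the rows list keeps accumulating across pages, and the CSV is opened in text mode ('w', newline='') as Python 3 expects, rather than 'wb'.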

Related

Want to scrape titles, dates, links, and content from the IOL website but can't

I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tags. However, when I tried to store them in a pandas DataFrame, I received the following error: ValueError: All arrays must be of the same length.
I also wrote code to get the content of each article using the links, but I got an error there as well. I would be thankful for any assistance.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import timedelta
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re

art_title = []  # to store the titles of all news articles
art_date = []   # to store the dates of all news articles
art_link = []   # to store the links of all news articles

pagesToGet = ['south-africa/eastern-cape']

for i in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.iol.co.za' + str(pagesToGet[i])
    print(url)

    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    # time.sleep(5)  # allows you to sleep your code before you retrieve the elements from the webpage. Additionally, to
    # prevent the chromedriver opening a new instance for every url, open the browser outside of the loop.

    # an exception might be thrown, so the code should be in a try-except block
    try:
        # use the browser to get the url. This is a suspicious command that might blow up.
        driver.get("https://www.iol.co.za/news/" + str(pagesToGet[i]))
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and the line that threw the exception
        continue  # ignore this page. Abandon this and go back.

    time.sleep(3)  # Allow 3 seconds for the web page to open

    # Scroll the screen to the end and click on "more news" up to the 15th page before scraping all the news
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
        i = 1
        while True:
            # scroll one screen height each time
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # update scroll height each time after scrolling, as the scroll height can change after we scrolled the page
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if (screen_height) * i > scroll_height:
                break
        driver.find_element(By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1
        time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))

    # Getting titles, dates, and links
    for j in news:
        # Article title
        title = j.findAll(re.compile('^h[1-6]'))
        for news_title in title:
            art_title.append(news_title.text)

        # Article dates
        dates = j.find('p', attrs={'class': 'sc-cIShpX'})
        if dates is not None:
            date = dates.text
            split_date = date.rsplit('|', 1)[1][10:].rsplit('<', 1)[0]
            art_date.append(split_date)

        # Article links
        address = j.find('a').get('href')
        news_link = 'https://www.iol.co.za' + address
        art_link.append(news_link)

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})

# Getting contents
new_articles = ...struggling to write the code
df['Content'] = news_articles

df.to_csv('data.csv')
driver.quit()
I think this is what you are looking for:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# Take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

# For every article we take what we want
for article in articles:
    header = article.find_element(By.XPATH, f".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']")
    print(header.get_attribute('textContent'))
    author_and_date = article.find_elements(By.XPATH, f".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, f".//a")
    print(link.get_attribute('href'))
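To also store the results without hitting the "All arrays must be of the same length" error from the question, one option (a sketch building on the loop above, not a verified script) is to append one dict per article instead of filling three parallel lists, and build the DataFrame from that list of dicts:

import pandas as pd

rows = []
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']")
    link = article.find_element(By.XPATH, ".//a")
    date_p = article.find_elements(By.XPATH, ".//p")  # assumed: the first <p> holds the author/date line
    rows.append({
        'Article_Title': header.get_attribute('textContent'),
        'Date': date_p[0].get_attribute('textContent') if date_p else '',
        'Source': link.get_attribute('href'),
    })

# every dict has the same keys, so the columns always line up
df = pd.DataFrame(rows)
df.to_csv('data.csv', index=False)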

with open nested loop in selenium isn't working

it's my first time posting here, so please let me know if I've messed anything up. I'm having some trouble with a nested loop in selenium. I'm trying to iterate through a list of players, gather stats for each one, and add them to a dataframe. Right now each player in the list gets entered into the search bar and their page is displayed, but stats are only collected for the last player in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)

dataframe1 = []

with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text

data = {
    'Player': player,
    'Completions': comps,
}
dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
The indentation is wrong for the lines where you initialize the data dict and append to dataframe1: they must be at the same level as the rest of the block inside the innermost for loop.
Here is your code, modified:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)

dataframe1 = []

with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
        data = {
            'Player': player,
            'Completions': comps,
        }
        dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
Thank you everyone for your assistance with this issue. I eventually found out that I did not need the 'button' variable or button.click() in the script. send_keys was already hitting return after the string was passed to the search field on the page, so return was basically getting hit twice: once on the player's name, and once on an empty search. The default page returned when searching for the empty query did not contain the element I was attempting to find, which resulted in an empty list. Again, thank you for your help with this issue.
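In other words, readlines() keeps the trailing newline, so send_keys(player) already submits the search. For reference, a sketch of the simplified loop (an illustration under that assumption, not the exact final script) could look like this:

from selenium.webdriver.common.keys import Keys

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.clear()
    # strip the newline kept by readlines() and submit the search once, explicitly
    search.send_keys(player.strip(), Keys.RETURN)
    for stat in driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr'):
        comps = stat.find_element(By.XPATH, './td[6]').text  # relative to the current row
        dataframe1.append({'Player': player.strip(), 'Completions': comps})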

How to run 'implicitly_wait()' in a 'for loop' with respect to web scraping using Python?

Actually, I want to scrape the 'title' and 'product description' for all the products from all the pages, and then save them into a '.csv' file.
URL: https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried.
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []

myDict = {'body-art': 3024}

browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)

for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            requests.get(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            for i in range(0, 2):  # Since it's a huge amount of data, I only take 2 products per page; otherwise it would be range(0, 30) to cover all the products on an individual page.
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    title_data = browser.find_elements(
                        By.CLASS_NAME, 'css-1gc4x7i').text
                    print(title_data)
                    for t in title_data:
                        title_list.append(t)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1

title_list = [i.split('.css', 1)[0] for i in title_list]

print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")

data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
            "Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, because of the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it's a blank file. Maybe I am wrong. Please help me with this. Do I need to add/change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets the timeout for how long the driver will keep polling the DOM to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, in case you need to put a pause in your code, you should use time.sleep, although this is not recommended.
Also, it is much preferable to use Expected Conditions explicit waits rather than browser.implicitly_wait, since browser.implicitly_wait only waits for element presence, i.e. it returns as soon as the element appears in the DOM, even though it may not be completely rendered yet.
In order to wait until an element is completely rendered and contains its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.

Unable to store scraped data in an Excel file

Hi guys, I was scraping this data and my code works fine, but I cannot figure out how to store the scraped data in an Excel file, and I am getting more confused after looking at other solutions and answers.
Here is my code:
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
import openpyxl
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

##### Web scraper for infinite scrolling page #####
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get("https://www.zomato.com/ncr/south-delhi-restaurants/fast-food?rating_range=4.0-5.0&category=2")
time.sleep(10)  # Allow 10 seconds for the web page to open
scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
i = 1
count = 0

while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolling, as the scroll height can change after we scrolled the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break

page_sources = BeautifulSoup(driver.page_source, "html.parser")
title = driver.find_elements_by_xpath("//a[@class='sc-jHZirH intUsQ']")
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    address = i.find_element_by_xpath('./p[1]')
    # data = openpyxl.load_workbook('Bakery.xlsx')
    df = pd.DataFrame({"Bakery Restaurants": name, "Address": address})

print(count)
driver.close()
This is the main block of code:
page_sources = BeautifulSoup(driver.page_source, "html.parser")
title = driver.find_elements_by_xpath("//a[@class='sc-jHZirH intUsQ']")
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    address = i.find_element_by_xpath('./p[1]')
    # data = openpyxl.load_workbook('Bakery.xlsx')
    df = pd.DataFrame({"Bakery Restaurants": name, "Address": address})

print(count)
driver.close()
Please help me out, I am quite confused; any suggestion/solution will be appreciated.
Assuming your scraping is working correctly, you need to store the name and address in the Excel file:
# scraping code
#
#
names = []      # Create lists to hold your columns
addresses = []  # Create lists to hold your columns
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4')
    names.append(name)
    address = i.find_element_by_xpath('./p[1]')
    addresses.append(address)

df = pd.DataFrame({'Names': names, 'Addresses': addresses})  # Create a DF with the lists

with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
Note: make sure the datatype of the scraped values is a string, or another type supported by pandas and Excel, and not a Python object.
Docs: Insert into DataFrame
DataFrame to Excel
This is my attempt, and it is working fine:
names = []
addresses = []
for i in title:
    count += 1
    name = i.find_element_by_xpath('./div/h4').text
    address = i.find_element_by_xpath('./p[1]').text
    names.append(name)
    addresses.append(address)

df = pd.DataFrame({'Names': names, 'Addresses': addresses})
writer = pd.ExcelWriter('pandas_simple.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()

Scraping with bs4 and selenium, each loop returns the same data

I'm pretty new to web scraping and am trying to scrape backdated data from timeanddate.com and output it to a csv. I'm using Selenium to get the data table for each date.
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
import csv

def getData(url, month, year):
    driver = webdriver.Chrome('C:/Users/adam/Desktop/chromedriver.exe')
    driver.get(url)
    Data = []
    soup = BeautifulSoup(driver.page_source, "lxml")
    for i in driver.find_element_by_id("wt-his-select").find_elements_by_tag_name("option"):
        i.click()
        table = soup.find('table', attrs={'id': 'wt-his'})
        for tr in table.find('tbody').find_all('tr'):
            dict = {}
            dict['time'] = tr.find('th').text.strip()
            all_td = tr.find_all('td')
            dict['humidity'] = all_td[5].text
            Data.append(dict)
    fileName = "output_month=" + month + "_year=" + year + ".csv"
    keys = Data[0].keys()
    with open(fileName, 'w') as result:
        dictWriter = csv.DictWriter(result, keys)
        dictWriter.writeheader()
        dictWriter.writerows(Data)

year_num = int(input("Enter your year to collect data from: "))
month_num = 1
year = str(year_num)
for i in range(0, 12):
    month = str(month_num)
    url = "https://www.timeanddate.com/weather/usa/new-york/historic?month=" + month + "&year=" + year
    data = getData(url, month, year)
    print(data)
    month_num += 1
The table I'm trying to scrape contains weather data, and I want to get the humidity for each day of the month.
The program cycles through the months, but the output is always the data for Mon, 1 Jan. Although the date changes in-browser, the same data is appended to the file each time (current output) rather than each new day being appended (desired output). I can't work out why it does this, and any help fixing it would be much appreciated.
The problem is that you parse the website only once, even though the page changes with each date selection. However, it is not enough to move the parsing inside the for loop; it is also necessary to wait until the page has loaded before re-parsing.
Below is my solution. There are two things to note:
I am making use of the WebDriverWait and expected_conditions that come built into Selenium.
I prefer finding elements by CSS selectors, which greatly simplifies the syntax. This awesome game can help you learn them.
# Necessary imports
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def getData(url, month, year):
    driver = webdriver.Chrome('C:/Users/adam/Desktop/chromedriver.exe')
    driver.get(url)
    wait = WebDriverWait(driver, 5)
    Data = []
    for opt in driver.find_elements_by_css_selector("#wt-his-select option"):
        opt.click()
        # wait until the table title changes to the selected date
        wait.until(EC.text_to_be_present_in_element((By.ID, 'wt-his-title'), opt.text))
        for tr in driver.find_elements_by_css_selector('#wt-his tbody tr'):
            dict = {}
            dict['time'] = tr.find_element_by_tag_name('th').text.strip()
            # Note: 5 is replaced with 6 because nth-of-type starts indexing from 1
            dict['humidity'] = tr.find_element_by_css_selector('td:nth-of-type(6)').text.strip()
            Data.append(dict)
    # continue with csv handlers ...
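The CSV-writing part can stay essentially as in the question, placed after the option loop inside getData; for example (a sketch of that continuation, with newline='' added so Windows does not insert blank rows):

    fileName = "output_month=" + month + "_year=" + year + ".csv"
    keys = Data[0].keys()
    with open(fileName, 'w', newline='') as result:
        dictWriter = csv.DictWriter(result, keys)
        dictWriter.writeheader()
        dictWriter.writerows(Data)
    driver.quit()  # close the browser that was opened for this month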
