I am learning how to extract data from websites and have managed to get a lot of information so far. However, on my next website I am failing for some unknown reason: nothing is saved to the text file, and I get no output from print. Here is my piece of code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests

url = 'https://www.jaffari.org/'
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()

soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.encode('utf-8').strip()

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)
print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, so you need to use a browser automation tool like Selenium to load the page and then use Beautiful Soup to get the data.
You need to download a compatible ChromeDriver from this link and pass the ChromeDriver path as I have done below.
Here is code to fetch the name and prayer times and save them to a text file.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# Path of the ChromeDriver binary.
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver.
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])
print('Done')
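Note: newer Selenium releases (4.x) deprecate executable_path and chrome_options. If the code above raises a TypeError on your Selenium version, the equivalent setup looks roughly like this (same driver path as above; adjust for your machine):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
# Selenium 4 style: the driver path goes through a Service object.
driver = webdriver.Chrome(service=Service(r"D:\Software\chromedriver.exe"), options=options)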
Updated code that saves each prayer time to a separate file named after the prayer.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# Path of the ChromeDriver binary.
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver.
driver.close()

for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])
print('Done')
The name variable needs to be a string rather than a bytes object. Try:

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)
print(name.decode())
Hope it helps.
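Alternatively, skip the bytes round-trip entirely by not calling .encode('utf-8') in the first place; a minimal sketch, reusing the soup object from the question's code:

table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.strip()  # already a str, so json.dump accepts it directly

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)
print(name)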
Related
I have a question. I'm making myself a portfolio app (scraping + Python) and I can't figure out two things.
First: how do I download the thumbnails of the photos here?
https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944
I had code that downloaded via this XPath:
//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[{counter}]/a/div/div/div[1]/div[1]/div
It was working fine, but for the past two days I have been getting
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element...
and I'm pulling my hair out. As for the code, I'm just learning scraping, so maybe it's not the best solution (maybe JS changes something dynamically?). How do I get these thumbnails?
Second question: which div do I click to enter a specific car listing? I'm also lost here, because you can click both the price and the photo, and either one enters the listing (example code: driver.find_element(By.CLASS_NAME, 'css-8wsg1m').click()).
Thank you very much for your help.
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # Cookies
img = driver.find_element(By.CLASS_NAME, 'css-gl6djm')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)

name = r'/home/.../Pulpit/GitHub_Public/Selenium_Porsche/work_dir/Image.jpg'
url = r'https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944'
downloadImage(url, name)
The easiest way to achieve this is by creating a downloadImage() function that takes the image URL as input and downloads the image to your machine. You can easily get the image URL with the help of Selenium.
import requests

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

img = driver.find_element(By.CLASS_NAME, 'CLASS_NAME_HERE')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
Your XPath is wrong. Here's the final code:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # Cookies
img = driver.find_element(
    By.XPATH, '//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[2]/a/div/div/div[1]/div[1]/div/img')
imgURL = img.get_attribute('src')
print(imgURL)
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
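For the second question (entering a specific listing): rather than clicking a styled element whose class name may change, one option is to read the listing anchor's href and navigate to it directly. A sketch, where the CSS selector is only a guess at the current OLX markup; inspect the page in DevTools to confirm it:

# Hypothetical selector: anchors inside listing cards. Verify before relying on it.
links = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='l-card'] a")
if links:
    driver.get(links[0].get_attribute('href'))  # opens the first car listing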
I am trying to scrape the data, but the data gets overwritten and only page 2's data ends up in the CSV file. Can you recommend a solution? I think the for loop overwrites the data. Thank you. This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)
    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)
        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass
            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)
You open the same file for (over)writing with 'w' on every pass of the page loop, so only the last page survives. You could use a different file name per page, or open with 'a' (append) instead, but with the current structure you would then get the header row repeated for every page.
Better would be to open the file for writing outside the for-page loop, write the header once, then write the rows inside the loop.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)
    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
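Since the question already imports pandas, another option is to collect every row in a list across all pages and write the file once at the end. A minimal sketch, with a hypothetical placeholder row standing in for the scraped fields:

import pandas as pd

rows = []  # collect one dict per casino across all pages
# Inside the scraping loop you would do:
# rows.append({'name': name, 'url': url, 'website_link': link,
#              'company': company, 'rating': rate})
rows.append({'name': 'Example Casino', 'url': 'https://example.com',
             'website_link': 'https://example.org', 'company': 'Example Ltd',
             'rating': '9.1'})  # hypothetical placeholder row

pd.DataFrame(rows).to_csv('product.csv', index=False, encoding='utf-8')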
I am new to Python, Selenium, and BeautifulSoup. I have already gone through many tutorials online, but I am still confused. Please help me.
So basically this is my Python code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
#import requests
import time
#import csv

passwordStr = '***'
usernameStr = '***'

chrome_path = r'C:\Users\wana isa\geckodriver-v0.26.0-win64\geckodriver.exe'
browser = webdriver.Firefox(executable_path=chrome_path)
browser.get('http://*********/')
wait = WebDriverWait(browser, 10)

# wait for transition then continue to fill items
#time.sleep(2)
password = wait.until(EC.presence_of_element_located((By.ID, 'txt_Password')))
password.send_keys(passwordStr)
username = wait.until(EC.presence_of_element_located((By.ID, 'txt_Username')))
username.send_keys(usernameStr)
signInButton = browser.find_element_by_id('button')
signInButton.click()

browser.get('http://******')
MainTab = browser.find_element_by_name('mainli_waninfo').click()
SubTab = browser.find_element_by_name('subli_bssinfo').click()
browser.switch_to.frame(browser.find_element_by_id('frameContent'))
html = browser.page_source
soup = bs(html, 'lxml')
#print(soup.prettify())

# For Service Provisioning Status; this is the data that I scrape and need saved to CSV.
spsList = ['ONT Registration Status', 'OLT Service Configuration Status', 'EMS Configuration Status', 'ACS Registration Status']
sps_id = ['td1_2', 'td2_2', 'td3_2', 'td4_2']
for i in range(len(sps_id)):
    elemntValu = browser.find_element_by_id(sps_id[i]).text
    print(spsList[i] + " : " + elemntValu)
browser.close()
The script prints each status name and its value, but nothing is saved to a file yet.
I'd appreciate it so much if you can help me.
Add this import to your code:
import csv
Then add the following to your code:
with open('FileName.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for i in range(len(sps_id)):
        elemntValu = browser.find_element_by_id(sps_id[i]).text
        print(spsList[i] + " : " + elemntValu)
        writer.writerow([spsList[i], elemntValu])
# The with block closes the file automatically, so no explicit close() is needed.
browser.close()
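If you also want column names in the file, write a header row once, right after creating the writer and before the loop; the names here are just placeholders:

writer.writerow(['status_item', 'value'])  # hypothetical column names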
Here is my full code. I want the output data in a CSV or Excel spreadsheet, with each field, such as title and price, in its own column. My code goes to the details page of each product and collects the necessary information, such as the product title and price.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# argument for incognito Chrome
option = Options()
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")

# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='c16H9d']")))
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()

# getting the link of each product
soup = BeautifulSoup(browser.page_source, "html.parser")
product_items = soup.find_all("div", attrs={"data-qa-locator": "product-item"})
for item in product_items:
    item_url = f"https:{item.find('a')['href']}"
    print(item_url)
    browser.get(item_url)

    # scrape details-page information
    itm_soup = BeautifulSoup(browser.page_source, "html.parser")
    container_box = itm_soup.find_all("div", {"id": "container"})
    # Use the itm_soup to find details about the item from its url.
    for itm in container_box:
        product_title_element = itm.find("span", class_="pdp-mod-product-badge-title")
        product_title = product_title_element.get_text() if product_title_element else "No title"
        print(product_title)

browser.quit()
How do I get the product title into a CSV or Excel spreadsheet?
You can use the csv writer module to accomplish this.
from csv import writer

def AddToCSV(List):
    with open("Output.csv", "a+", newline='') as output_file:
        csv_writer = writer(output_file)
        csv_writer.writerow(List)

# this can be used within your for loop
row_list = [item_url, product_title]  # add price and any other fields here
AddToCSV(row_list)
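One caveat with append mode: each run keeps adding to Output.csv, and no header row is ever written. If you want a header, a small guard like this (a sketch, with hypothetical column names) writes it only when the file does not exist yet; call it once before the scraping loop:

import os
from csv import writer

if not os.path.exists("Output.csv"):
    with open("Output.csv", "a+", newline='') as output_file:
        writer(output_file).writerow(["url", "title"])  # hypothetical column names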
I am using BeautifulSoup for extracting pictures, which works well for normal pages.
Now I want to extract the picture of the Chromebook from a page like this:
https://twitter.com/banprada/statuses/829102430017187841
The page apparently contains a link to another page with the image. Here is my code for downloading an image from the mentioned link, but I am only getting the image of the person who posted the link.
import urllib.request
import os
from bs4 import BeautifulSoup

URL = "http://twitter.com/banprada/statuses/829102430017187841"
list_dir = "D:\\"
default_dir = os.path.join(list_dir, "Pictures_neu")
opener = urllib.request.build_opener()
urllib.request.install_opener(opener)

soup = BeautifulSoup(urllib.request.urlopen(URL).read(), "html.parser")
imgs = soup.findAll("img", {"alt": True, "src": True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    with open(filename, "wb") as f:
        f.write(img_data.read())
Is there a way to download the image somehow?
Many thanks and regards,
Andi
This is how you can get only the mentioned image using Selenium + requests:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests

link = 'https://twitter.com/banprada/statuses/829102430017187841'

driver = webdriver.PhantomJS()
driver.get(link)
wait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[starts-with(@id, 'xdm_default')]")))
image_src = driver.find_element_by_tag_name('img').get_attribute('src')

response = requests.get(image_src).content
with open('C:\\Users\\You\\Desktop\\Image.jpeg', 'wb') as f:
    f.write(response)
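Note that PhantomJS has since been deprecated in Selenium; if webdriver.PhantomJS() is unavailable in your version, headless Chrome is a rough drop-in replacement (assuming ChromeDriver is installed):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # replaces webdriver.PhantomJS()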
If you want to get all the images from all iframes on the page (excluding images in the initial page source, which you can already get with your own code):
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests
import time

link = 'https://twitter.com/banprada/statuses/829102430017187841'

driver = webdriver.Chrome()
driver.get(link)
time.sleep(5)  # To wait until all iframes are completely rendered. Might need to be increased

iframe_counter = 0
while True:
    try:
        driver.switch_to_frame(iframe_counter)
        pictures = driver.find_elements_by_xpath('//img[@src and @alt]')
        if len(pictures) > 0:
            for pic in pictures:
                response = requests.get(pic.get_attribute('src')).content
                with open('C:\\Users\\You\\Desktop\\Images\\%s.jpeg' % (str(iframe_counter) + str(pictures.index(pic))), 'wb') as f:
                    f.write(response)
        driver.switch_to_default_content()
        iframe_counter += 1
    except WebDriverException:
        break
Note that you can use any webdriver.
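For example, the same loop works with Firefox, assuming geckodriver is installed:

from selenium import webdriver

driver = webdriver.Firefox()  # requires geckodriver on PATH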