I am new to Python, Selenium, and BeautifulSoup. I have already watched many tutorials online, but I am still confused. Please help me.
So basically, this is my Python code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
#import requests
import time
#import csv
passwordStr = '***'
usernameStr = '***'

# path to the geckodriver executable
driver_path = r'C:\Users\wana isa\geckodriver-v0.26.0-win64\geckodriver.exe'
browser = webdriver.Firefox(executable_path=driver_path)
browser.get('http://*********/')
wait = WebDriverWait(browser, 10)

# wait for the transition, then continue to fill in the fields
#time.sleep(2)
password = wait.until(EC.presence_of_element_located((By.ID, 'txt_Password')))
password.send_keys(passwordStr)
username = wait.until(EC.presence_of_element_located((By.ID, 'txt_Username')))
username.send_keys(usernameStr)
signInButton = browser.find_element_by_id('button')
signInButton.click()

browser.get('http://******')
browser.find_element_by_name('mainli_waninfo').click()  # main tab
browser.find_element_by_name('subli_bssinfo').click()   # sub tab
browser.switch_to.frame(browser.find_element_by_id('frameContent'))

html = browser.page_source
soup = bs(html, 'lxml')
#print(soup.prettify())

# Service Provisioning Status: this is the data that I scrape and need to save into a CSV
spsList = ['ONT Registration Status', 'OLT Service Configuration Status', 'EMS Configuration Status', 'ACS Registration Status']
sps_id = ['td1_2', 'td2_2', 'td3_2', 'td4_2']
for i in range(len(sps_id)):
    elemntValu = browser.find_element_by_id(sps_id[i]).text
    print(spsList[i] + " : " + elemntValu)

browser.close()
This is the output: the four status lines print correctly, but I need them saved into a CSV file. I would appreciate it so much if you could help me.
Add this import to your code:
import csv
Add the following to your code:
with open('FileName.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for i in range(len(sps_id)):
        elemntValu = browser.find_element_by_id(sps_id[i]).text
        print(spsList[i] + " : " + elemntValu)
        writer.writerow([spsList[i], elemntValu])

browser.close()
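If you also want a header row in the CSV, you could write one right after creating the writer; the column names here are only an illustration:

    writer = csv.writer(file)
    writer.writerow(['Status', 'Value'])  # hypothetical column names, written once before the loop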
I am using the 2captcha API to solve reCAPTCHA v2 Invisible for my automated browsing. I have consulted and followed the instructions on the 2captcha homepage, YouTube, and the Stack Overflow community. After running, I get a response and it shows the result, but the captcha is not verified. Can anybody please help me figure out what I need to do to submit the form after the solved captcha code is placed?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.proxy import *
from time import sleep
import time
import openpyxl
import requests
import pandas as pd
from solveRecaptcha import solveRecaptcha
# Load the Excel file
df = pd.read_excel('file.xlsx')

# Loop through the rows of the DataFrame
for index, row in df.iterrows():
    # Print the current row
    print('Current row: /n', row)

    # Do your processing here
    username = row['Name']
    password = row['Pass']

    # Create a new instance of the webdriver with the updated capabilities
    # Use selenium to log in with the username and password
    driver = webdriver.Chrome('./chromedriver')
    driver.get("https://www.reddit.com/account/register/")
    driver.find_element(By.XPATH, "/html/body/div/main/div[1]/div/div[2]/form/fieldset[2]/button").click()
    sleep(3)
    driver.find_element(By.XPATH, "/html/body/div/main/div[2]/div/div/div[2]/div[1]/form/fieldset[1]/input[2]").send_keys(username)
    driver.find_element(By.CSS_SELECTOR, "input#regPassword").send_keys(password)
    sleep(10)
    result = solveRecaptcha(
        "6LeTnxkTAAAAAN9QEuDZRpn90WwKk_R1TRW_g-JC",
        "https://www.reddit.com/account/register/",
    )
    print(result)
    sleep(10)
    code = result('code')
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'g-recaptcha-response'))
    )
    driver.execute_script(
        "document.getElementByID('g-recaptcha-response').innerHTML = " + "'" + code + "'")
    driver.find_element(By.ID, "recaptcha-verify-button").click()
    sleep(77)
Everyone, after getting help from @John Gordon, I found my mistake. Thank you, John!
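The fix was to index the 2captcha result as a dict instead of calling it like a function:

code = result('code')   # wrong: a dict is not callable
code = result['code']   # right: dict lookup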
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.proxy import *
from time import sleep
import time
import openpyxl
import requests
import pandas as pd
from solveRecaptcha import solveRecaptcha
# Load the Excel file
df = pd.read_excel('file.xlsx')

# Loop through the rows of the DataFrame
for index, row in df.iterrows():
    # Print the current row
    print('Current row:\n', row)

    # Do your processing here
    username = row['Name']
    password = row['Pass']

    # Create a new instance of the webdriver
    # Use Selenium to log in with the username and password
    driver = webdriver.Chrome('./chromedriver')
    driver.get("https://www.reddit.com/account/register/")
    driver.find_element(By.XPATH, "/html/body/div/main/div[1]/div/div[2]/form/fieldset[2]/button").click()
    sleep(3)
    driver.find_element(By.XPATH, "/html/body/div/main/div[2]/div/div/div[2]/div[1]/form/fieldset[1]/input[2]").send_keys(username)
    driver.find_element(By.CSS_SELECTOR, "input#regPassword").send_keys(password)
    sleep(10)
    result = solveRecaptcha(
        "6LeTnxkTAAAAAN9QEuDZRpn90WwKk_R1TRW_g-JC",
        "https://www.reddit.com/account/register/",
    )
    print(result)
    sleep(10)
    code = result['code']
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'g-recaptcha-response'))
    )
    driver.execute_script(
        "document.getElementById('g-recaptcha-response').innerHTML = '" + code + "'")
    driver.find_element(By.ID, "recaptcha-verify-button").click()
    sleep(77)
I am trying to scrape the data, but it gets overwritten: the CSV file ends up with only the last page's data. Kindly recommend a solution; I am waiting for your response. How can I fix this? If there is any way, please suggest it. I think the for loop overwrites the data. Thank you. This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)

    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)

    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)

        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass

            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)
You open the same file for (over)writing with 'w' on every pass through the loop, so the second page's data overwrites the first. You could use a different file name per page or open with 'a' (append) instead, but with the current structure you would then also get the header row repeated for every page.
Better would be to open the file for writing outside the for page loop, write the header once, then write the rows inside the loop.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)

    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
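For completeness, here is a sketch of the full script restructured that way, reusing the selectors from the question (untested; it assumes the page markup shown above). It also resets the fields to empty strings for each URL, so a failed lookup cannot silently reuse the value from the previous row:

with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    thewriter.writerow(['name', 'url', 'website_link', 'company', 'rating'])

    for page in range(1, 3):
        driver.get('https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page))
        time.sleep(2)

        # collect the detail-page links for this listing page
        urls = [link.get_attribute("href")
                for link in driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")]

        for url in urls:
            driver.get(url)
            time.sleep(1)
            # reset the fields so a missing value stays empty instead of leaking from the previous row
            name = company = link = rate = ''
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except Exception:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except Exception:
                pass
            thewriter.writerow([name, url, link, company, rate])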
I want to learn Python, and for that I started with a small web scraping project.
I want to make a competitive scorecard for a travel agency. First of all, here is the site link: tn.tunisiebooking.com
As you see, you have to fill out the form and then a list of hotels is displayed. I managed to automate the search, but I got stuck at the data extraction step: I don't know why it goes back and extracts the data from the home page.
If you can help me and explain why it behaves like this, thank you in advance. Here is the code I used:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import requests

PATH = r"C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# fill in the search form fields via JavaScript
script = "document.getElementById('ville_des').value ='Sousse';document.getElementById('depart').value ='05/08/2021';document.getElementById('checkin').value ='05/08/2021';document.getElementById('select_ch').value = '1';"
driver.execute_script(script)

btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
print(driver.current_url)

r = requests.get(driver.current_url)
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('div', attrs={'class': 'bloc_titre'})
len(results)

records = []
for result in results:
    nom = result.find('a').text
    records.append(nom)
len(records)

import pandas as pd
df = pd.DataFrame(records, columns=['nom'])
df.head()
For more details: this is the home page (screenshot: HomePage), and this is the page I want to scrape, the hotel list that opens after I submit the form with my destination and dates (screenshot: hotelList). The problem is that the output of my code shows the list from the home page, not the second page (screenshot: Output). I hope I have made it clear now. Thank you.
This will get the names of the hotels using Selenium only. The reason your version extracts the home page again is most likely that requests.get(driver.current_url) opens a brand-new session: the search results only exist in the browser session that submitted the form, so the fresh request simply returns the home page. Read the rendered page from the driver itself instead.
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

PATH = r"C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# fill in the search form via JavaScript // your script seems fine
script = "document.getElementById('ville_des').value ='Sousse';document.getElementById('depart').value ='05/08/2021';document.getElementById('checkin').value ='05/08/2021';document.getElementById('select_ch').value = '1';"
driver.execute_script(script)

btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# getting the hotel names by XPath in a loop
for v in range(1, 20):
    hotel_name = driver.find_element_by_xpath('/html/body/div[6]/div[2]/div[1]/div/div[2]/div/div[4]/div[' + str(v) + ']/div/div[3]/div[1]/div[1]/span/a/h3').get_attribute('innerHTML')
    print(hotel_name)
I don't know what other details you want, but this is an example of the hotel names based on your input.
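A somewhat sturdier variant (untested, and assuming the hotel names really are the h3 elements inside the result links, as the tail of the absolute XPath above suggests) grabs them all at once instead of counting indexes, so it also works when a page has fewer than 19 results:

for h3 in driver.find_elements_by_xpath('//span/a/h3'):
    print(h3.get_attribute('innerHTML'))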
I was trying to scrape Google Maps. The phone and hours variables are not returning any data. The other variables work fine and return data. The XPath is correct, so I am not sure what the issue is here.
Here is the LINK
The other selectors, like name, address, title, and website, return the data fine, but phone and hours do not return any data.
Hoping for some answers.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import csv
from tqdm import tqdm
import time
driver = webdriver.Firefox()
linksFile = open("links.txt", 'r')
allLinks = linksFile.readlines()

for link in tqdm(allLinks):
    try:
        driver.get(link)
    except Exception:
        print('Something went wrong with the URL: ')
    # time.sleep(15)
    while True:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]'))
        )
        results = driver.find_elements_by_xpath('//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]')
        for result in results:
            # writing to the CSV file
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            business = driver.find_element_by_xpath('//div[@role="heading"]/div')
            business.click()
            # waiting for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="immersive-container"]'))
            )
            # parsing the response with the Scrapy selector
            response = Selector(text=driver.page_source)
            name = response.xpath('//h2[@data-attrid="title"]/span/text()').get()
            title = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/parent::div/parent::div/following-sibling::div/div/span/span/text()').get()
            address = response.xpath('//a[contains(text(), "Address")]/parent::span/following-sibling::span/text()').get()
            website = response.xpath('(//a[contains(text(), "Website")])/@href').get()
            phone = response.xpath('//a[contains(text(), "Phone")]/parent::span/following-sibling::span/a/span/text()').get()
            hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//btext()').get()
            total_reviews = response.xpath('(//span[contains(text(), "Google reviews")])[1]/text()').get()
            total_rating = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/span/text()').get()
            input('Check: ')
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            vals = [name, title, address, website, phone, hours, total_reviews, total_rating]
            writer.writerow(vals)
            outFile.close()
Can you use the JavaScript outerHTML instead of page_source?

response = Selector(text=driver.execute_script("return document.documentElement.outerHTML"))
There is also an issue in the XPath for hours:
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()
Try the Google Maps link rather than Google Search: https://www.google.com/maps/place/Leduc+Plumbing+and+Heating/@53.274672,-113.5486679,17z/data=!3m1!4b1!4m5!3m4!1s0x539ff9a5d31a87c9:0xf494d91aafd55e55!8m2!3d53.2746688!4d-113.5464739
It should be more stable.
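Putting both fixes together (a sketch using the same selectors as above):

response = Selector(text=driver.execute_script("return document.documentElement.outerHTML"))
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()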
I am learning how to extract data from websites now and have managed to get a lot of information. However, on my next website I am failing for some unknown reason: nothing is saved to the text file, nor do I get any output in print. Here is my piece of code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests
url = 'https://www.jaffari.org/'
request = urllib.request.Request(url,headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()
soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class":"textwidget"})
name = table.text.encode('utf-8').strip()
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)

print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, so you need to use a browser tool like Selenium to load the page and then use Beautiful Soup to get the data.
You need to download a compatible ChromeDriver from this link and pass the ChromeDriver path as I have done below.
The code here fetches each name and prayer time and saves them to a text file.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Run Chrome in headless mode.
options.add_argument("--headless")

# path of the chrome driver
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Close the driver
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])

print('Done')
Updated version that saves each prayer time under a different file name.
(The imports, driver setup, page load, and parsing are the same as in the script above; only the saving loop changes.)
for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])

print('Done')
The name variable needs to be a string rather than a bytes object. Try with:
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)

print(name.decode())
Hope it helps.
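Alternatively, you could skip the bytes round-trip entirely by dropping the .encode() call in the original script, so name stays a str:

name = table.text.strip()  # already a str, no encode/decode needed
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)
print(name)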