hCaptcha Bypass with Selenium & Python

I'm trying to build a voting bot for a French Minecraft server.
The thing is that there is an hCaptcha, and I'm using 2Captcha to bypass it.
Everything works up to a point: I get the token from 2Captcha and insert that same token into the captcha's textarea.
But when I submit the form to process the vote, it doesn't seem to work as expected. After digging for about 3 days I couldn't find any solution.
import os
import time

from twocaptcha import TwoCaptcha
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ====================
# GLOBAL PARAM
# ====================
driver = webdriver.Chrome(executable_path="./driver/chromedriver.exe")
original_window = driver.current_window_handle
driver.get("https://serveur-prive.net/minecraft/seasonsky-skyblock-100-farmtowin-1-18-crack-on-6330/vote")


def HCaptchaSolver():
    def solver():
        api_key = os.getenv('APIKEY_2CAPTCHA', '2CAPTCHA_API_KEY')
        solver = TwoCaptcha(api_key)
        try:
            result = solver.hcaptcha(
                sitekey='c6b0b71f-47cc-4512-b4df-a55e1a97a349',
                url='https://serveur-prive.net/minecraft/seasonsky-skyblock-100-farmtowin-1-18-crack-on-6330/vote',
            )
        except Exception as e:
            print(e)
            return False
        else:
            return result

    result = solver()
    if result:
        code = result['code']
        print(code)
        driver.execute_script(
            "document.querySelector('[name=\"h-captcha-response\"]').style = 'block'")
        driver.execute_script(
            "document.querySelector('[name=\"h-captcha-response\"]').innerHTML = '" + code + "'")
        print(r"/!\ --> JS Code executed")
        driver.find_element(By.CSS_SELECTOR, "#btnvote").click()
    else:
        print(r"/!\ --> 2Captcha not working :p")


# wait for the hCaptcha iframe to load
WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '#c > div > div > div.bvt > div.col > div.vote > div.form > div > div > iframe')))
# HCaptchaSolver()
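For reference, a minimal sketch of an alternative way to hand the solved token to the page: write it into the textarea's value (not only innerHTML) before clicking the vote button. This assumes the site reads the token from the field's value; that assumption is not verified against this page.

# Sketch only: inject the 2Captcha token via the textarea's value.
# Assumes the page reads the token from the field's value attribute.
def inject_token(driver, code):
    driver.execute_script(
        """
        const area = document.querySelector('[name="h-captcha-response"]');
        area.style.display = 'block';  // make the hidden textarea visible (optional)
        area.value = arguments[0];     // write the token into the value, not only innerHTML
        """,
        code,
    )
    driver.find_element(By.CSS_SELECTOR, "#btnvote").click()

It could be called in place of the two execute_script calls above, e.g. inject_token(driver, result['code']).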
Note that you can recreate my problem by creating a 2Captcha account, solving a few captchas to earn a few cents of credit, and trying it out.
Thanks for the help !
Roy

Related

Unable to print foodpanda product links using selenium python

I want to extract the link from each href tag, but nothing is printed for https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants
from selenium import webdriver

driver = webdriver.Chrome('F:/chromedriver')
driver.get("https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants")
# response = scrapy.Selector(text=driver.page_source)

list = driver.find_elements_by_css_selector("ul.vendor-list li")
length = len(driver.find_elements_by_css_selector("ul.vendor-list li"))
for i in range(length):
    try:
        name = driver.find_elements_by_css_selector(".headline .name")[i].text
        time = driver.find_elements_by_css_selector(".badge-info")[i].text.strip()
        rating = driver.find_elements_by_css_selector(".rating")[i].text
        dealtag = driver.find_elements_by_css_selector(".multi-tag")[i].text
        link = driver.find_elements_by_css_selector(".vendor [href]")[i].text
        print(name, link, time, rating, dealtag)
    except:
        pass
Please read the code below; it is working fine on my computer.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)
driver.get('https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants')

Vendor_list = driver.find_elements_by_xpath("//figure[@class=\"vendor-tile item\"]/ancestor::li")
for vendor in Vendor_list:
    print("-------------------")
    print("Restaurant Name :- " + vendor.find_element_by_xpath(".//span[@class=\"name fn\"]").text)
    print("Badge :- " + vendor.find_element_by_xpath(".//span[@class=\"badge-info\"]").text[:2] +
          vendor.find_element_by_xpath(".//span[@class=\"badge-info\"]/span").text)
    try:
        print("Rating :- " + vendor.find_element_by_xpath(".//span[@class=\"rating\"]").text)
    except:
        print("No Rating Available")
    try:
        print("Multi Tag :- " + vendor.find_element_by_xpath(".//span[@class=\"multi-tag\"]").text)
    except:
        print("No Tag Info")
    print("Vendor URL :- " + vendor.find_element_by_xpath(".//a").get_attribute("href"))
If it solves your problem then please mark it as answer.
There are no elements with the exact class name vendor there.
You should use something like //*[contains(@class,'vendor')]//a[@href]
I used XPath since I prefer working with it, but you can also use a similar css_selector.
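A minimal sketch of how that suggestion could be used to print the links (the contains(@class,'vendor') locator is taken from the answer above and is not verified against the current page markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants")

# wait until at least one vendor link is present, then read the href attribute of each one
wait = WebDriverWait(driver, 30)
links = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//*[contains(@class,'vendor')]//a[@href]")))
for link in links:
    print(link.get_attribute("href"))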

scraping with selenium cant click on clickable text

I am trying to scrape some data from Yahoo Finance; for each stock, I want to get the historical data. Taking the Apple stock as an example, I should go to https://finance.yahoo.com/quote/AAPL/history?p=AAPL and choose "MAX" from "Time Period".
I believe the script I wrote so far is getting the date element, but somehow clicking on it so that I can choose "MAX" is not working.
Here is my whole script:
# using linux here
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

project_path = os.getcwd()
driver_path = project_path + "/" + "chromedriver"
yahoo_finance = "https://finance.yahoo.com/quote/"
driver = webdriver.Chrome(driver_path)


def get_data(symbol='AAPL'):
    stock_history_link = yahoo_finance + symbol + '/history?p=' + symbol
    driver.get(stock_history_link)
    date_picker = '//div[contains(@class, "D(ib)") and contains(@class, "Pos(r)") and contains(@class, "Cur(p)")' \
                  ' and contains(@class, "O(n):f")]'
    try:
        print("I am inside")
        date_picker_2 = "//div[@class='Pos(r) D(ib) O(n):f Cur(p)']"
        date_picker_element = driver.find_element_by_xpath(date_picker_2)
        print("date_picker_element: ", date_picker_element)
        date_picker_element.click()
        try:
            print("I will be waiting for the date")
            my_dropdown = WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.ID, 'dropdown-menu'))
            )
            print(my_dropdown)
            print("I am not waiting anymore")
        except TimeoutException as e:
            print("wait timed out")
            print(e)
    except WebDriverException:
        print("Something went wrong while trying to pick the max date")


if __name__ == '__main__':
    try:
        get_data()
    except:
        pass
    # finally:
    #     driver.quit()
To click the Max button, just open the dropdown and target it.
driver.get("https://finance.yahoo.com/quote/AAPL/history?p=AAPL")
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.XPATH, "//span[@class='C($linkColor) Fz(14px)']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-value='MAX']"))).click()
Element:
<button class="Py(5px) W(45px) Fz(s) C($tertiaryColor) Cur(p) Bd Bdc($seperatorColor) Bgc($lv4BgColor) Bdc($linkColor):h Bdrs(3px)" data-value="MAX"><span>Max</span></button>
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
You have the wrong xpath for the date_picker_2:
date_picker_2 = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[1]/div[1]/div/div/div/span'
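A minimal sketch of combining that locator with explicit waits before clicking, then picking MAX (the //button[@data-value='MAX'] locator is reused from the answer above; neither locator is verified against Yahoo's current markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://finance.yahoo.com/quote/AAPL/history?p=AAPL")
wait = WebDriverWait(driver, 10)

# open the Time Period dropdown, then click the MAX preset
date_picker_2 = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[1]/div[1]/div/div/div/span'
wait.until(EC.element_to_be_clickable((By.XPATH, date_picker_2))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-value='MAX']"))).click()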
Using requests:
import requests
import datetime
end = int(datetime.datetime.strptime(datetime.date.today().isoformat(), "%Y-%m-%d").timestamp())
url = f"https://finance.yahoo.com/quote/AAPL/history?period1=345427200&period2={end}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
requests.get(url)
Gets you to the same end page.

How do I export data from multiple pages into a csv file?

I am working on a scraping project and am in the final stages. Right now, my code can navigate to the first profile, scrape the data from that profile, print that data, then move on to the next profile and repeat the process. Now, I want to put the data I collect into a CSV file instead of printing it. I am not sure how to do this, so I am looking for guidance on updating my current code. Thank you for your help!
My current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Profiles to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"

    Data = (Name + " , " + IssuedBy + " , " + CertificationorDesignaationNumber + " , " + CertfiedorDesignatedSince + " , " + AccreditedBy + " , " + Expires)
    print(Data)

    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.close()
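Since the question is about writing the collected fields to a CSV instead of printing them, here is a minimal sketch using Python's built-in csv module (the column names mirror the variables above; the output filename profiles.csv is an assumption):

import csv

# Collect one list of fields per profile, then write them all at the end.
rows = []

# Inside the existing "for link in profile_count:" loop, replace print(Data) with:
# rows.append([Name, IssuedBy, CertificationorDesignaationNumber,
#              CertfiedorDesignatedSince, AccreditedBy, Expires])

with open("profiles.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Issued By", "Certification Number",
                     "Certified Since", "Accredited By", "Expires"])
    writer.writerows(rows)  # one row per scraped profile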

Selenium Python: element not clickable at point (X, Y), another element would receive the click (here the element is neither a button nor a link)

This is a link to an e-commerce website which I would like to crawl. I am looking for a way to click on the Most Helpful, Positive, Negative, Most Recent and By Certified Buyers sections and scrape the values. Heads up: it's not a button, so ActionChains and JavaScript code do not work on it.
I want to move from one section to another either by clicking or by any other method. I tried using the JavaScript executor and ActionChains, but I am unable to get it to work.
For this, my XPath is:
path = '//div[@class="o5jqS-"]/div[X]//div[contains(@class,"_3MuAT6")]'
which actually returns an element. The "X" value is replaced in a loop with 1 to 5, with 1 signifying "Most Helpful" and 5 signifying "By Certified Buyers".
My code is below:
for j in range(0, 5):
    new_xpath = xpath_hash["FirstPageReviews"]["TitleOfReviewType"].replace("[X]", "[" + str(j + 1) + "]")
    new_xpath1 = xpath_hash["FirstPageReviews"]["TitleElement"].replace("[X]", "[" + str(j + 1) + "]")
    title_element = driver.find_element_by_xpath(new_xpath1)
    driver.execute_script("arguments[0].click();", title_element)
    # ActionChains(driver).move_to_element(title_element).click().perform()
You could use my code, based on a page object; I have tried this and it worked:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select


class SeleniumBaseClass(object):
    def __init__(self, driver):
        self.driver = driver

    def open(self, URL):
        self.driver.get(URL)

    def driverURLChange(self, URL):
        print("change URL" + URL)
        self.driver.get(URL)

    def currentUrl(self):
        print("URL " + self.driver.current_url)
        return self.driver.current_url

    def locateElement(self, loc):
        try:
            print(loc)
            element = WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(loc))
            return element
        except:
            print("cannot find {0} element".format(loc))
            return None

    def waitForElementInvisible(self, loc):
        # load-spinner
        try:
            element = WebDriverWait(self.driver, 10).until(EC.invisibility_of_element_located(loc))
            return True
        except:
            print("cannot invisibility_of_element {0} element".format(loc))
            return False

    def send_key_with_Element(self, loc, value):
        self.locateElement(loc).clear()
        self.locateElement(loc).send_keys(value)

    def click_with_Element(self, loc):
        self.locateElement(loc).click()

    def clickElementsBySendKey(self, loc, value):
        self.locateElement(loc).send_keys(value)


customdriver = SeleniumBaseClass(webdriver.Chrome())
customdriver.open("https://www.flipkart.com/sony-mdr-zx110-wired-headphones/p/itmehuh6zm9s7kgz?pid=ACCDZRSEYPFHAT76&srno=s_1_1&otracker=search&lid=LSTACCDZRSEYPFHAT76TANM1F&qH=a684a6245806d98f")

HelpfulTab = (By.XPATH, "//div[contains(text(),'Most Helpful')]")
PositiveTab = (By.XPATH, "//div[contains(text(),'Positive')]")
customdriver.click_with_Element(PositiveTab)
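To move through all five sections rather than just one, here is a small extension of the same idea (the tab labels are taken from the question; whether they match the live page text exactly is an assumption):

# Click each review-filter tab in turn; scrape the values after each click.
tab_labels = ["Most Helpful", "Positive", "Negative", "Most Recent", "By Certified Buyers"]
for label in tab_labels:
    tab = (By.XPATH, "//div[contains(text(),'" + label + "')]")
    customdriver.click_with_Element(tab)
    # ... scrape the review values for this section here ...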

Scraping with Beautifulsoup-Python

I want to scrape the name of the hotel on TripAdvisor from each review page of the hotel.
I wrote a very simple script in Python, and I don't think it is wrong.
But every time it stops at a different point (a different page; for example, the first time it stopped at page 150, the second time at page 330).
I am 100% sure that my code is correct. Is there any possibility that TripAdvisor blocks me every time?
I updated the code and now use Selenium too, but the problem still remains.
The updated code is the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os
import urllib.request
import time
import re

file2 = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews2.csv"), "wb")
file2.write(b"hotel,Address,HelpCount,HotelCount,Reviewer" + b"\n")
Checker = "REVIEWS"

# example option: add 'incognito' command line arg to options
option = webdriver.ChromeOptions()
option.add_argument("--incognito")

# create new instance of chrome in incognito mode
browser = webdriver.Chrome(executable_path='/Users/thimios/AppData/Local/Google/chromedriver.exe', chrome_options=option)
# print(browser)

# go to website of interest
for i in range(10, 50, 10):
    Websites = ["https://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-or" + str(i) + "-The_Thief-Oslo_Eastern_Norway.html#REVIEWS"]
    print(Websites)
    for theurl in Websites:
        thepage = browser.get(theurl)
        thepage1 = urllib.request.urlopen(theurl)
        soup = BeautifulSoup(thepage1, "html.parser")

        # wait up to 10 seconds for page to load
        timeout = 5
        try:
            WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="HEADING"]')))
            # print(WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="HEADING"]'))))
        except TimeoutException:
            print("Timed out waiting for page to load")
            browser.quit()

        # Extract the helpful votes, hotel reviews
        helpcountarray = ""
        hotelreviewsarray = ""
        for profile in soup.findAll(attrs={"class": "memberBadging g10n"}):
            image = profile.text.replace("\n", "|||||").strip()
            # print(image)
            if image.find("helpful vote") > 0:
                counter = re.findall(r'\d+', image.split("helpful vote", 1)[0].strip()[-4:])
                if len(helpcountarray) == 0:
                    helpcountarray = [counter]
                else:
                    helpcountarray.append(counter)
            elif image.find("helpful vote") < 0:
                if len(helpcountarray) == 0:
                    helpcountarray = ["0"]
                else:
                    helpcountarray.append("0")
            print(helpcountarray)
            # print(len(helpcountarray))

            if image.find("hotel reviews") > 0:
                counter = re.findall(r'\d+', image.split("hotel reviews", 1)[0].strip()[-4:])
                if len(hotelreviewsarray) == 0:
                    hotelreviewsarray = counter
                else:
                    hotelreviewsarray.append(counter)
            elif image.find("hotel reviews") < 0:
                if len(hotelreviewsarray) == 0:
                    hotelreviewsarray = ['0']
                else:
                    hotelreviewsarray.append("0")
            print(hotelreviewsarray)
            # print(len(hotelreviewsarray))

        hotel_element = browser.find_elements_by_xpath('//*[@id="HEADING"]')
        Address_element = browser.find_elements_by_xpath('//*[@id="HEADING_GROUP"]/div/div[3]/address/div/div[1]')
        for i in range(0, 10):
            print(i)
            for x in hotel_element:
                hotel = x.text
                print(hotel)
                # print(type(hotel))
            for y in Address_element:
                Address = y.text.replace(',', '').replace('\n', '').strip()
                print(Address)
                # print(type(Address))
            HelpCount = helpcountarray[i]
            HelpCount = " ".join(str(w) for w in HelpCount)
            print(HelpCount)
            # print(type(HelpCount))
            HotelCount = hotelreviewsarray[i]
            HotelCount = " ".join(str(w) for w in HotelCount)
            print(HotelCount)
            # print(type(HotelCount))
            Reviewer = soup.findAll(attrs={"class": "username mo"})[i].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
            print(Reviewer)
            Record2 = hotel + "," + Address + "," + HelpCount + "," + HotelCount + "," + Reviewer
            if Checker == "REVIEWS":
                file2.write(bytes(Record2, encoding="ascii", errors='ignore') + b"\n")

file2.close()
I read somewhere that I should add a header. Something like
headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
in order for the web site to allow me to scrape it. Is that true?
Thanks for your help
Yes, there is such a possibility.
Websites use techniques to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages.
The default User-Agent typically identifies the request as coming from an automated Python process, so you will want to change it to a browser-like User-Agent.
Even so, I do not believe you were blocked by TripAdvisor.
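A minimal sketch of sending that browser-like User-Agent with the urllib request used in the question's code (theurl and BeautifulSoup refer to the variables already defined there; the header string is the one quoted in the question):

import urllib.request
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
# build the request with the header instead of calling urlopen(theurl) directly
req = urllib.request.Request(theurl, headers=headers)
thepage1 = urllib.request.urlopen(req)
soup = BeautifulSoup(thepage1, "html.parser")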
Try to slow down the downloading by
import time
...
time.sleep(1)
No, try really slowing it down, using backoff so the target website doesn't think you're a bot...
import time
import requests

for term in ["web scraping", "web crawling", "scrape this site"]:
    t0 = time.time()
    r = requests.get("http://example.com/search", params=dict(
        query=term
    ))
    response_delay = time.time() - t0
    time.sleep(10 * response_delay)  # wait 10x longer than it took them to respond
source:
https://blog.hartleybrody.com/web-scraping-cheat-sheet/#delays-and-backing-off
