I've created a web-scraping script and I'm using 2Captcha to solve captchas. 2Captcha has a Python library, but I've written my own functions to generate the captcha ID and the captcha token.
My captcha module has three functions: get_captcha_id(), get_captcha_response(), and apply_token().
Everything works great, and I'm able to solve a couple dozen captchas until eventually I run into the following error:
ERROR_WRONG_CAPTCHA_ID
When this happens, the script first hits ERROR_CAPTCHA_UNSOLVABLE, then the loop goes back and generates an entirely new captcha ID. Maybe I should keep the same ID and just generate a new token?
I just want to know if there's a better way to do this anyway...
Here is the code that starts 2Captcha in my main script:
captcha_solved = 0

# Solves reCAPTCHA via the 2Captcha API
while captcha_solved == 0:
    captcha_id = captcha.get_captcha_id(browser.current_url)
    if captcha_id != 0 or captcha_id != None:
        print("Captcha ID is: " + str(captcha_id))
        cap_res = captcha.get_captcha_response(captcha_id)
        if cap_res == "ERROR_CAPTCHA_UNSOLVABLE" or cap_res == "ERROR_TOKEN_EXPIRED" or cap_res == "ERROR_WRONG_CAPTCHA_ID":
            print("Captcha failed... Restarting captcha")
            browser.refresh()
            sleep(1)
            continue
        else:
            print("Captcha Token: " + cap_res)
            captcha.apply_token(browser, cap_res)
            solver.report(captcha_id, True)
            captcha_solved = captcha_solved + 1
            break
Once this while loop is complete, the main script will start. After about two dozen captchas or so, I'll receive this error:
Traceback (most recent call last):
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\main.py", line 191, in <module>
cap_res = captcha.get_captcha_response(captcha_id)
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\captcha.py", line 83, in get_captcha_response
solver.report(cap_id, False)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\solver.py", line 496, in report
self.api_client.res(key=self.API_KEY, action=rep, id=id_)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\api.py", line 113, in res
raise ApiException(resp)
twocaptcha.api.ApiException: ERROR_WRONG_CAPTCHA_ID
I thought I had added enough failsafes to be able to regenerate a captcha token.
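One extra failsafe I'm thinking about is wrapping the report() call itself, so a rejected ID can't crash the polling loop. A rough, untested sketch (safe_report is just a name I made up):

def safe_report(cap_id, success):
    # hypothetical wrapper: report the result, but never let ERROR_WRONG_CAPTCHA_ID
    # (or any other API error) kill the loop that called it
    try:
        solver.report(cap_id, success)
    except ApiException as e:
        print("Could not report captcha result: " + str(e))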
Here is my captcha.py file code:
from twocaptcha import TwoCaptcha
from random import randint
from time import sleep
from urllib.request import urlopen, Request
import re
from bs4 import BeautifulSoup
from twocaptcha.solver import ValidationException
from twocaptcha.api import NetworkException, ApiException
from selenium.common.exceptions import TimeoutException

#solver = TwoCaptcha('API_KEY')
site_key = "###"
api_key = "###"

config = {
    'server': '2captcha.com',
    'apiKey': api_key,
    'callback': 'https://your.site.com/',
    'defaultTimeout': 120,
    'recaptchaTimeout': 600,
    'pollingInterval': 10,
}

proxy = {
    'type': 'HTTP',
    'uri': '###'
}

user_agent = '###'

solver = TwoCaptcha(**config)
print("2Captcha Balance: $" + str(solver.balance()))


def get_captcha_id(captcha_url):
    try:
        result = solver.recaptcha(sitekey=site_key, url=captcha_url, proxy=proxy)
        #print(result)
        split_string = str(result).split(":", 1)
        substring = split_string[0]
        #print(substring)
        if (substring == "{'captchaId'"):
            strip_beginning = re.sub("{'captchaId': '", "", str(result))
            captcha_id = re.sub("'}", "", strip_beginning)
            return captcha_id
        else:
            print("could not find captcha ID")
            return 0
    except ValidationException as e:
        # invalid parameters passed
        print(e)
        return e
    except NetworkException as e:
        # network error occurred
        print(e)
        return e
    except ApiException as e:
        # api respond with error
        print(e)
        return e
    except TimeoutException as e:
        # captcha is not solved so far
        print(e)
        return e


def get_captcha_response(cap_id):
    capcha_ready = 0
    response_url = "https://2captcha.com/res.php?key=" + api_key + "&action=get&id=" + cap_id
    while capcha_ready == 0:
        PageRequest = Request(response_url, data=None, headers={'User-Agent': user_agent})
        PageResponse = urlopen(PageRequest)
        PageHtml = PageResponse.read()
        PageSoup = BeautifulSoup(PageHtml, 'html.parser')
        SoupText = str(PageSoup)
        if SoupText == "ERROR_CAPTCHA_UNSOLVABLE" or SoupText == "ERROR_WRONG_CAPTCHA_ID" or SoupText == "ERROR_TOKEN_EXPIRED":
            solver.report(cap_id, False)
            return SoupText
        elif str(PageSoup) == "CAPCHA_NOT_READY":
            print("Waiting for capcha response...")
            rand = randint(12, 18)
            print("sleeping for " + str(rand) + " seconds")
            sleep(rand)
        else:
            split_string = str(PageSoup).split("|", 1)
            if len(split_string) > 0:
                substring = split_string[1]
                return substring
            capcha_ready = capcha_ready + 1
            #print(PageSoup)
            return PageSoup


def apply_token(browser, token):
    print("Applying token to browser...")
    browser.execute_script('document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(token))
    print("Token applied")
Thanks for your help with this, I really appreciate it!
Related
So, I am scraping a webpage that has an element displaying an integer. When I scrape that element, I store the plain text in a variable; then, on each subsequent scrape, I compare the variable to the current text on the webpage. I am not sure if maybe I need to send a new request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"


def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if t.text.strip() == "Rating":
                val = stat.find(class_="value").text.strip()
                return val


def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()


_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time

while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message=_message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')

    #page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")

    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message=_message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()

    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
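And to answer the other part of your question: yes, the page needs to be requested again on every iteration, otherwise get_mmr() keeps parsing the same soup object that was fetched once at the top of the script. A rough, untested sketch combining both fixes, re-using the URL and class names from your code:

import time
import requests
from bs4 import BeautifulSoup

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"

def get_mmr(url):
    # fetch a fresh copy of the page on every call
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()

previous_mmr = None
while True:
    recent_mmr = get_mmr(URL)
    if recent_mmr != previous_mmr:
        print("mmr changed: " + str(recent_mmr))
        previous_mmr = recent_mmr
    time.sleep(60)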
I'm currently trying to put two things together when checking multiple websites from my input CSV file:
Check HTTP status
Check if Website displays specific keyword
then save the results to a new CSV file.
My input.csv:
id url
1 https://example123.com
2 https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/
3 https://mundoshoponline.com
My Code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

df = pd.read_csv('path/to/my/input.csv')
#my csv has urls in the 1st column
urls = df.T.values.tolist()[1]
results = {}
status = []


async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text=re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text=re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text=re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text=re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text=re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text=re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        status.append("Down")
    except requests.exceptions.HTTPError:
        status.append("Other")
    else:
        status.append("OK")


async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

comingList = []
openingList = []
forbiddenList = []
notfoundList = []
underList = []
currentlyList = []

#mark x if there are any hits for specific keyword
for url in results:
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
    openingList.append("x" if len(results[url]["Opening Soon"]) > 0 else "")
    forbiddenList.append("x" if len(results[url]["Forbidden"]) > 0 else "")
    notfoundList.append("x" if len(results[url]["Page not found"]) > 0 else "")
    underList.append("x" if len(results[url]["Under Construction"]) > 0 else "")
    currentlyList.append("x" if len(results[url]["Currently Unavailable"]) > 0 else "")

df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = status
print(df)
df.to_csv('path/to/my/output.csv', index=False)
However, whenever I run the above script with for url in urls:, it throws the error below for some of my URLs, the script breaks, and output.csv is not generated:
Traceback (most recent call last):
File "path/to/myscan.py", line 51, in <module>
comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
KeyError: 'http://example123.com'
and when running it with for url in results:, output.csv looks like this:
(screenshot of the generated output.csv)
This seems erroneous: the first row has keywords marked as present (comingSoon and underConstruction columns) and status = Down, but that website doesn't contain the 'coming soon' or 'under construction' strings.
Would someone be able to help me with this? I believe there might be an issue in my loop or try/except part of the code. I'm happy to provide more information if the above is not sufficient. Thank you in advance.
I think your main problem is that you are iterating over the whole urls list, some of which may have failed and therefore do not exist in results as keys.
A much safer way to do this is to iterate over the subset of urls that you are sure have succeeded and have a key in results, so instead of
for url in urls:
you could make it
for url in results:
To make the final results consistent with the input order of your urls:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

df = pd.read_csv('./input.csv')
#my csv has urls in the 4th column
urls = ['example123.com', 'https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/', 'http://alotechgear.com']
results = {}
status = {}


async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text=re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text=re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text=re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text=re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text=re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text=re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Down"
    except requests.exceptions.HTTPError:
        status[url] = "Other"
    else:
        status[url] = "OK"


async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

comingList = []
openingList = []
forbiddenList = []
notfoundList = []
underList = []
currentlyList = []
statusList = []

#mark x if there are any hits for specific keyword
for url in urls:
    if not results.get(url):
        statusList.append(status.get(url))
        notfoundList.append("x")
        comingList.append("-")
        openingList.append("-")
        forbiddenList.append("-")
        underList.append("-")
        currentlyList.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        forbiddenList.append("x" if len(results[url].get("Forbidden")) > 0 else "-")
        notfoundList.append("x" if len(results[url].get("Page not found")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        currentlyList.append("x" if len(results[url].get("Currently Unavailable")) > 0 else "-")

df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
print(df)
df.to_csv('./output.csv', index=False)
sample result:
id url comingSoon openingSoon forbidden notfound2 underConstruction currentlyUnavailable status
0 1 https://example123.com - - - x - - Down
1 2 https://envato.com/blog/30-outstanding-c... x - - - x - OK
2 3 https://mundoshoponline.com - - - x - - Down
I'm building a python script that gathers data from Instagram, based on a user list provided in my database. However, I'm running into some issues trying to handle unexpected JSON response.
To give some context, the program is fetching a username from my database table (24/7, looping over hundreds of accounts - hence the while True: loop), requesting a URL with that username, and expecting a certain JSON response (specifically, it's looking for ['entry_data']['ProfilePage'][0] in the response).
However, when a username isn't found on Instagram, the JSON is different and the expected part (['entry_data']['ProfilePage'][0]) is not in there, so my script crashes.
With the current code:
def get_username_from_db():
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT * FROM ig_users_raw WHERE `username` IS NOT NULL ORDER BY `ig_users_raw`.`last_checked` ASC LIMIT 1")
            row = cursor.fetchall()
            username = row[0]['username']
    except pymysql.IntegrityError:
        print('ERROR: ID already exists in PRIMARY KEY column')
    return username


def request_url(url):
    try:
        response = requests.get(url)
    except requests.HTTPError:
        raise requests.HTTPError(f'Received non 200 status code from {url}')
    except requests.RequestException:
        raise requests.RequestException
    else:
        return response.text


def extract_json_data(url):
    try:
        r = requests.get(url, headers=headers)
    except requests.HTTPError:
        raise requests.HTTPError('Received non-200 status code.')
    except requests.RequestException:
        raise requests.RequestException
    else:
        print(url)
        soup = BeautifulSoup(r.content, "html.parser")
        scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
        stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
        j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
        return j


if __name__ == '__main__':
    while True:
        sleep(randint(5, 15))
        username = get_username_from_db()
        url = f'https://www.instagram.com/{username}/'
        j = extract_json_data(url)
        json_string = json.dumps(j)
        user_id = j['graphql']['user']['id']
        username = j['graphql']['user']['username']
        #print(user_id)
        try:
            with connection.cursor() as cursor:
                db_data = (json_string, datetime.datetime.now(), user_id)
                sql = "UPDATE `ig_users_raw` SET json=%s, last_checked=%s WHERE `user_id`= %s "
                cursor.execute(sql, db_data)
                connection.commit()
                print(f'{datetime.datetime.now()} - data inserted for user: {user_id} - {username}')
        except pymysql.Error:
            print('ERROR: ', pymysql.Error)
I'm getting the following error/traceback:
https://www.instagram.com/geloria.itunes/
Traceback (most recent call last):
File "D:\Python\Ministry\ig_raw.py", line 63, in <module>
j = extract_json_data(url)
File "D:\Python\Ministry\ig_raw.py", line 55, in extract_json_data
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
Ideally, I want this to just skip past the account (in this case geloria.itunes), and move to the next one in the database. I might want to remove the account, or at least remove the username from the row.
In an effort to solve this myself, I experimented with if / else loops, but in the case where it would continue, I'd just be looping over the same account.
Do you have any suggestions on how I can tackle this specific issue?
Thanks!
First of all, you need to figure out why the exception occurred.
The reason you're getting this error is that you're telling json to parse an invalid (non-JSON) string.
Just run this example with URL you've provided in traceback:
import re
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.instagram.com/geloria.itunes/")
print(r.status_code) # outputs 404(!)
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
print(stringified_json)
# j = json.loads(stringified_json) # will raise an exception
Output:
\n(function(){\n function normalizeError(err) {\n...
...
stringify(normalizedError));\n })\n }\n })\n}());
As you can see, stringified_json is not a valid JSON string.
As you mentioned, it is invalid because this Instagram page is hidden or does not exist (the HTTP status code is 404 Not Found), and you're passing the wrong response to json.loads() because there is no check on the response status code in your script.
The following except clauses did not catch the "404 case" because you received a valid HTTP response, so there was no exception to raise:
except requests.HTTPError:
raise requests.HTTPError('Received non-200 status code.')
except requests.RequestException:
raise requests.RequestException
So basically you have two ways to deal with this issue:
check the response HTTP status code manually, like if r.status_code != 200: ...
or use the raise_for_status() method to throw an exception if 400 <= r.status_code < 600 (see the short sketch after this list)
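For the second option, a minimal sketch of what the top of extract_json_data could look like:

r = requests.get(url, headers=headers)
r.raise_for_status()  # raises requests.exceptions.HTTPError for any 4xx/5xx response, including this 404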
I might want to remove the account, or at least remove the username from the row.
Well, your question here sounds a bit vague. I can just give an idea.
For example, if a 404 page is encountered, you can raise your custom exception when dealing with the response, catch it later in __main__, delete the record from the database, and continue with the other pages:
class NotFoundError(Exception):
    """ my custom exception for not found pages """
    pass


...  # other functions


def extract_json_data(url):
    r = requests.get(url, headers=headers)
    if r.status_code == 404:
        raise NotFoundError()  # page not found
    # if any other error occurs (network unavailable for example) - an exception will be raised
    soup = BeautifulSoup(r.content, "html.parser")
    scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
    stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
    return json.loads(stringified_json)['entry_data']['ProfilePage'][0]


if __name__ == '__main__':
    while True:
        sleep(randint(5, 15))
        username = get_username_from_db()
        url = f'https://www.instagram.com/{username}/'
        try:
            j = extract_json_data(url)
        except NotFoundError:
            delete_user_from_db(username)  # implement: DELETE FROM t WHERE username = ...
            continue  # proceed for next user page
        # rest of your code:
        # json_string = json.dumps(j)
        # user_id = j['graphql']['user']['id']
        # ...
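A possible implementation of that delete_user_from_db() helper, assuming the same pymysql connection object and ig_users_raw table used elsewhere in your script (adjust names as needed):

def delete_user_from_db(username):
    # remove the username that returned a 404 so it is not picked up again
    with connection.cursor() as cursor:
        cursor.execute("DELETE FROM `ig_users_raw` WHERE `username` = %s", (username,))
    connection.commit()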
Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2


def url1_to_string(url1):
    html = ""
    proxyDict = {
        'http': 'http://username:pwd@proxyurl:8080',
        'https': 'https://username:pwd@proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1, proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))


df = pd.read_csv(r'C:\filepath\abc.csv', encoding='latin-1')

anchor_count = []
account_count = []
aggregate_page_count = []
agg_url_count = []

for index, row in df.iterrows():
    agg_url_list = []
    ini_url = "http://www.google.com/search?q=" + row['ANCHOR_NAME'] + " AND " + row['ACCOUNT_NAME']
    r = requests.get(ini_url, proxies={"http": "http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text, "html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1 = 0
    for x1 in sublist1[:3]:
        anchorcount1 = 0
        accountcount1 = 0
        print("aagg url", x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt:
            print('You cancelled the operation.')
        finally:
            pass
        ny_bb1 = ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME'] = row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME'] = row['ACCOUNT_NAME'].upper()
        anchor_name = re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name = re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if anchor_name == account_name:
            if row['ANCHOR_NAME'] in ny_bb1.upper():
                anchorcount1 = anchorcount1 + 1
            if row['ACCOUNT_NAME'] in ny_bb1.upper():
                accountcount1 = accountcount1 + 1
        else:
            if anchor_name in ny_bb1.upper():
                anchorcount1 = anchorcount1 + 1
            if account_name in ny_bb1.upper():
                accountcount1 = accountcount1 + 1
        if anchorcount1 > 0 and accountcount1 > 0:
            aggregate_count1 = aggregate_count1 + 1
            agg_url_list.append(x1[:])
            print("existance of both", aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)

df['anc_cnt'] = pd.Series(anchor_count)
df['acc_cnt'] = pd.Series(account_count)
df['agg_cnt'] = pd.Series(aggregate_page_count)
df['agg_url_list'] = pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block, and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, moving on to the next URLs and so on.
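One thing I've been considering is passing a timeout to requests.get() inside url1_to_string(), so that a hung request raises an exception instead of blocking inside the try forever. A rough, untested sketch of that change:

try:
    print('Before res in try')
    # timeout is in seconds; a hung request now raises requests.exceptions.Timeout
    # instead of blocking inside the try block forever
    res = requests.get(url1, proxies=proxyDict, timeout=10)
    print('After res in try')
except requests.exceptions.RequestException as e:
    print('Request failed:', e)
    return ""  # hypothetical fallback so the caller can move on to the next URL
html = res.text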
I'm trying to send an attachment through Selenium, using the self._attach_and_send_screenshot() function to automate it.
Enter anything after scanning QR code
Traceback (most recent call last):
File "wht.py", line 21, in
attach_and_send_screenshot()
NameError: name 'attach_and_send_screenshot' is not defined
import os
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, ElementNotVisibleException
from selenium.webdriver.common.keys import Keys
from urllib.parse import quote_plus

driver = webdriver.Chrome()
driver.get('https://web.whatsapp.com/')

all_names = ['Anas Cse']
msg = 'testing'
count = 1

input('Enter anything after scanning QR code')

for name in all_names:
    user = driver.find_element_by_xpath('//span[@title = "{}"]'.format(name))
    user.click()
    msg_box = driver.find_element_by_class_name('_2S1VP')
    for i in range(count):
        self._attach_and_send_screenshot()
        # msg_box.send_keys(msg)
        # button = driver.find_element_by_class_name('_2lkdt')
        # button.click()


def _attach_and_send_screenshot(self):
    # TODO - ElementNotVisibleException - this shouldn't happen but when would it
    # local variables for x_path elements on browser
    attach_xpath = '//*[@id="main"]/header/div[3]/div/div[2]/div'
    send_file_xpath = '//*[@id="app"]/div/div/div[1]/div[2]/span/div/span/div/div/div[2]/span[2]/div/div'
    if self.attachment_type == "img":
        attach_type_xpath = '//*[@id="main"]/header/div[3]/div/div[2]/span/div/div/ul/li[1]/input'
    elif self.attachment_type == "cam":
        attach_type_xpath = '//*[@id="main"]/header/div[3]/div/div[2]/span/div/div/ul/li[2]/button'
    elif self.attachment_type == "doc":
        attach_type_xpath = '//*[@id="main"]/header/div[3]/div/div[2]/span/div/div/ul/li[3]/input'
    try:
        # open attach menu
        attach_btn = driver.find_element_by_xpath(attach_xpath)
        attach_btn.click()
        # Find attach file btn and send screenshot path to input
        time.sleep(1)
        attach_img_btn = driver.find_element_by_xpath(attach_type_xpath)
        # TODO - might need to click on transportation mode if url doesn't work
        attach_img_btn.send_keys(os.getcwd() + "/screenshot.png")  # get current script path + img_path
        time.sleep(1)
        send_btn = driver.find_element_by_xpath(send_file_xpath)
        send_btn.click()
        # close attach menu
        time.sleep(1)
        attach_btn = driver.find_element_by_xpath(attach_xpath)
        attach_btn.click()
    except (NoSuchElementException, ElementNotVisibleException) as e:
        print(str(e))
        send_message(str(e))
        send_message("Bot failed to retrieve search content, try again...")


def send_message(msg):
    whatsapp_msg = driver.find_element_by_class_name('_2S1VP')
    whatsapp_msg.send_keys(msg)
    whatsapp_msg.send_keys(Keys.ENTER)
Move the function definitions above your main logic. You are trying to call a function that has not been defined yet at the point where you call it.
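A rough sketch of that reordering, with the helpers defined as plain module-level functions (no self, since there is no class here) before the loop that calls them; untested, and the function bodies stay as in the question:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

# 1. define every helper before the code that calls it
def send_message(msg):
    whatsapp_msg = driver.find_element_by_class_name('_2S1VP')
    whatsapp_msg.send_keys(msg)
    whatsapp_msg.send_keys(Keys.ENTER)

def attach_and_send_screenshot():
    # body unchanged from the question, minus the unused `self` parameter
    ...

# 2. only then run the main logic that uses the helpers
driver.get('https://web.whatsapp.com/')
input('Enter anything after scanning QR code')
for name in ['Anas Cse']:
    user = driver.find_element_by_xpath('//span[@title = "{}"]'.format(name))
    user.click()
    attach_and_send_screenshot()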