I was told that snscrape in Python still throws an error. I tried to solve it by adding top=True to the TwitterSearchScraper call, but it still fails.
Here is my code:
pd.options.display.max_colwidth = 500
query = "(music) lang:en since:2023-01-01 until:2023-02-02"
tweets = []
limit = 10
get_ipython().run_line_magic('time', '')
try:
    print("start scraping")
    for tweet in sntwitter.TwitterSearchScraper(query=query, top=True).get_items():
        if len(tweets) == limit:
            break
        else:
            tweets.append([tweet.date, tweet.user.username, tweet.content])
    df = pd.DataFrame(tweets, columns=['datetime', 'username', 'content'])
except Exception as e:
    print(e)
print("Finished")
print("-------")
Can somebody help me solve this error?
Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%28Gempa%29+lang%3Aid++since%3A2023-01-01+until%3A2023-02-02&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%28Gempa%29+lang%3Aid++since%3A2023-01-01+until%3A2023-02-02&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.
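As an aside on the limit logic (this does not address the non-200 response itself, which is returned by Twitter's side): a common way to cap the number of items taken from a generator such as get_items() is itertools.islice, instead of counting and breaking manually. A minimal sketch, assuming the usual snscrape and pandas imports and the same tweet attributes used above:

import itertools
import pandas as pd
import snscrape.modules.twitter as sntwitter

query = "(music) lang:en since:2023-01-01 until:2023-02-02"
limit = 10

# Take at most `limit` tweets straight from the generator.
rows = [
    [tweet.date, tweet.user.username, tweet.content]
    for tweet in itertools.islice(sntwitter.TwitterSearchScraper(query).get_items(), limit)
]
df = pd.DataFrame(rows, columns=['datetime', 'username', 'content'])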
I've created a script for web scraping and I'm using 2Captcha to solve captchas. 2Captcha has a Python library, but I've created my own functions to generate the captcha ID and captcha token code.
My captcha module has 3 functions: get_captcha_id(), get_captcha_response(), and apply_token()
Everything works great, and I'm able to solve a couple dozen captchas until eventually I get the following error:
ERROR_WRONG_CAPTCHA_ID
When this happens, the script first hits ERROR_CAPTCHA_UNSOLVABLE, then the loop goes back and generates an entirely new captcha ID. Maybe I should keep the same ID and just generate a new token?
I just want to know if there's a better way to do this anyway.
Here is the code that starts 2Captcha in my main script:
captcha_solved = 0
# Solves reCAPTCHA via the 2Captcha API
while captcha_solved == 0:
    captcha_id = captcha.get_captcha_id(browser.current_url)
    if captcha_id != 0 or captcha_id != None:
        print("Captcha ID is: " + str(captcha_id))
        cap_res = captcha.get_captcha_response(captcha_id)
        if cap_res == "ERROR_CAPTCHA_UNSOLVABLE" or cap_res == "ERROR_TOKEN_EXPIRED" or cap_res == "ERROR_WRONG_CAPTCHA_ID":
            print("Captcha failed... Restarting captcha")
            browser.refresh()
            sleep(1)
            continue
        else:
            print("Captcha Token: " + cap_res)
            captcha.apply_token(browser, cap_res)
            solver.report(captcha_id, True)
            captcha_solved = captcha_solved + 1
            break
Once this while loop is complete, the main script starts. After about two dozen captchas or so, I'll receive this error:
Traceback (most recent call last):
  File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\main.py", line 191, in <module>
    cap_res = captcha.get_captcha_response(captcha_id)
  File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\captcha.py", line 83, in get_captcha_response
    solver.report(cap_id, False)
  File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\solver.py", line 496, in report
    self.api_client.res(key=self.API_KEY, action=rep, id=id_)
  File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\api.py", line 113, in res
    raise ApiException(resp)
twocaptcha.api.ApiException: ERROR_WRONG_CAPTCHA_ID
I thought I had added enough failsafes to be able to regenerate a captcha token.
Here is my captcha.py file code:
from twocaptcha import TwoCaptcha
from random import randint
from time import sleep
from urllib.request import urlopen, Request
import re
from bs4 import BeautifulSoup
from twocaptcha.solver import ValidationException
from twocaptcha.api import NetworkException, ApiException
from selenium.common.exceptions import TimeoutException

# solver = TwoCaptcha('API_KEY')
site_key = "###"
api_key = "###"

config = {
    'server': '2captcha.com',
    'apiKey': api_key,
    'callback': 'https://your.site.com/',
    'defaultTimeout': 120,
    'recaptchaTimeout': 600,
    'pollingInterval': 10,
}

proxy = {
    'type': 'HTTP',
    'uri': '###'
}

user_agent = '###'

solver = TwoCaptcha(**config)
print("2Captcha Balance: $" + str(solver.balance()))

def get_captcha_id(captcha_url):
    try:
        result = solver.recaptcha(sitekey=site_key, url=captcha_url, proxy=proxy)
        # print(result)
        split_string = str(result).split(":", 1)
        substring = split_string[0]
        # print(substring)
        if substring == "{'captchaId'":
            strip_beginning = re.sub("{'captchaId': '", "", str(result))
            captcha_id = re.sub("'}", "", strip_beginning)
            return captcha_id
        else:
            print("could not find captcha ID")
            return 0
    except ValidationException as e:
        # invalid parameters passed
        print(e)
        return e
    except NetworkException as e:
        # network error occurred
        print(e)
        return e
    except ApiException as e:
        # api respond with error
        print(e)
        return e
    except TimeoutException as e:
        # captcha is not solved so far
        print(e)
        return e

def get_captcha_response(cap_id):
    capcha_ready = 0
    response_url = "https://2captcha.com/res.php?key=" + api_key + "&action=get&id=" + cap_id
    while capcha_ready == 0:
        PageRequest = Request(response_url, data=None, headers={'User-Agent': user_agent})
        PageResponse = urlopen(PageRequest)
        PageHtml = PageResponse.read()
        PageSoup = BeautifulSoup(PageHtml, 'html.parser')
        SoupText = str(PageSoup)
        if SoupText == "ERROR_CAPTCHA_UNSOLVABLE" or SoupText == "ERROR_WRONG_CAPTCHA_ID" or SoupText == "ERROR_TOKEN_EXPIRED":
            solver.report(cap_id, False)
            return SoupText
        elif str(PageSoup) == "CAPCHA_NOT_READY":
            print("Waiting for capcha response...")
            rand = randint(12, 18)
            print("sleeping for " + str(rand) + " seconds")
            sleep(rand)
        else:
            split_string = str(PageSoup).split("|", 1)
            if len(split_string) > 0:
                substring = split_string[1]
                return substring
            capcha_ready = capcha_ready + 1
    # print(PageSoup)
    return PageSoup

def apply_token(browser, token):
    print("Applying token to browser...")
    browser.execute_script('document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(token))
    print("Token applied")
Thanks for your help with this, I really appreciate it!
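Not part of the original post, but since the traceback shows solver.report() itself raising ApiException with ERROR_WRONG_CAPTCHA_ID, one small defensive change worth sketching is to guard that call so a failed report does not abort the whole run. A minimal sketch, reusing the ApiException import already present in captcha.py (safe_report is a hypothetical helper):

from twocaptcha.api import ApiException

def safe_report(solver, captcha_id, success):
    # Report the result, but treat a rejected or unknown ID as a warning
    # instead of letting ApiException propagate.
    try:
        solver.report(captcha_id, success)
    except ApiException as e:
        print("Could not report captcha " + str(captcha_id) + ": " + str(e))

Calling safe_report(solver, cap_id, False) inside get_captcha_response() and safe_report(solver, captcha_id, True) in the main loop would keep the current behaviour while surviving the ERROR_WRONG_CAPTCHA_ID failure.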
I can't close the pop-up window. I wrote a function that should close it, but it doesn't, and the exception handling doesn't help either: the same error is thrown immediately.
I would be grateful for your help. I have already spent a lot of time trying different approaches without success, including locating the element by XPath and by class.
#bot.message_handler(commands=['start'])
def start(message):
    bot.send_message(message.chat.id, "Создаеться прайс")  # "Creating the price list"
    b = 0

    # Eldorado
    # while True:
    link_eldo = "https://www.mvideo.ru/product-list-page-cls?q=redmi&limit=12&region_id=1&category_id=cat2_cis_0000000357"
    print(requests.get(link_eldo))
    # requests_url = requests.get(link_eldo)
    # if requests_url.status_code == 403:
    #     print("Получилось")  # "It worked"
    #     options.add_argument(f"user-agent={user_agent.random}")
    driver.get(link_eldo)

    def func(i):
        i = i + 1
        return i

    def close_banner(browser):
        wait(browser, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "close"))).click()

    # time.sleep(20)
    name_eldo_product = driver.find_elements_by_xpath("/html/body/div[2]/div[1]/div[4]/div[2]/div[3]/div/div[1]/div[1]/div['func(0)']/div[3]/div/div[1]/h4/a")
    price_eldo_product = driver.find_elements_by_class_name("fl-product-tile-price__current")
    name_product = []
    price_product = []
    for name in name_eldo_product:
        name_product.append(name.text)
    for price in price_eldo_product:
        price = price.text
        price = re.sub('[\W_]+', '', price)
        rub = "р"
        for clear in rub:
            price = price.replace(clear, "")
        price_product.append(int(price))
    print(price_product)
    сreat_dic = dict(zip(name_product, price_product))
    main_products = {}
    main_products.update(сreat_dic)
    print(main_products)

    link_eldo = "https://www.mvideo.ru/product-list-page-cls?q=iphone12&limit=12&region_id=1&category_id=cat2_cis_0000000357"
    print(requests.get(link_eldo))
    # requests_url = requests.get(link_eldo)
    # if requests_url.status_code == 403:
    #     print("Получилось")  # "It worked"
    #     options.add_argument(f"user-agent={user_agent.random}")
    driver.get(link_eldo)
    try:
        close_banner(driver)
        name_eldo_product = driver.find_elements_by_xpath("/html/body/div[2]/div[1]/div[4]/div[2]/div[3]/div/div[1]/div[1]/div['func(0)']/div[3]/div/div[1]/h4/a")
        price_eldo_product = driver.find_elements_by_class_name("fl-product-tile-price__current")
    except NoSuchElementException:
        # spelling error making this code not work as expected
        pass
    except StaleElementReferenceException:
        close_banner(driver)
        pass
    except TimeoutException:
        print('poput')
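Not from the original post: since the stated symptom is that close_banner() throws immediately, one defensive variant treats an absent or non-clickable banner as non-fatal. A minimal sketch, assuming the standard Selenium wait utilities (the original code aliases WebDriverWait as wait):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def close_banner(browser, timeout=10):
    # Wait for the close button; if it never becomes clickable,
    # log and move on instead of letting TimeoutException bubble up.
    try:
        WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "close"))
        ).click()
        return True
    except TimeoutException:
        print("No clickable banner close button found within %s seconds" % timeout)
        return False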
I have the following structure in my script:
id_list = [1, 2, 3]

for id in id_list:
    data = get_data(id)
    meta = get_metadata(id)

    # If there is a response, continue:
    if((data.json()) and (data.json())['job_status']=='SUCCESS'):
        # Do stuff
    else:
        print('Id is not found')
Here is the get_data() function:
def get_data(form_id):
    survey_found = False
    try:
        print("------- Getting data from Source. Please wait. -------\n")
        print("------- Getting data from Source. Please wait. -------\n", file=logfile)
        # the actual request that assigns `response` is omitted in this excerpt
        response.raise_for_status()
        print(response.content)
        survey_found = True
        return response
    except (RuntimeError, TypeError, NameError, snowCtx.connection.errors.Error, requests.exceptions.HTTPError) as e:
        print("******* Error from Source (GETTING DATA): *******\n" + str(e) + " on form id: " + str(form_id))
        print("******* Error from Source (GETTING DATA): *******\n" + str(e) + " on form id: " + str(form_id), file=logfile)
        survey_found = False
        return survey_found
I don't care about get_metadata(), since the condition is on get_data().
The problem is that if the first id is not available, the code stops executing because an HTTP error is raised in the except path.
I need the script to continue over other IDs in the list.
id_list = [1, 2, 3]

for id in id_list:
    data = get_data(id)
    if isinstance(data, bool) and not data:
        print(f"skipping {id}...")
        continue
    meta = get_metadata(id)
    if((data.json()) and (data.json())['job_status']=='SUCCESS'):
        # Do stuff
    else:
        print('Id is not found')
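A small design note, not from the original answer: an alternative to the isinstance check is to have get_data() return None on failure, which reads a bit more explicitly, at the cost of changing its return value. A minimal sketch of that variant:

# Hypothetical variant: assumes get_data() is changed to `return None` in its except branch.
for id in id_list:
    data = get_data(id)
    if data is None:
        print(f"skipping {id}...")
        continue
    # ... proceed with data.json() as before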
I am trying to web scrape news articles matching certain keywords, using Python 3. However, I am not able to get all the articles from the newspaper: after scraping some articles into the output CSV file, I get an ArticleException error. Could anyone help me with this? Ideally, I would like to solve the problem and download all the related articles from the newspaper website. Otherwise, it would also be useful to just skip the URL that raises the error and continue with the next one. Thanks in advance for your help.
This is the code I am using:
import urllib.request
import newspaper
from newspaper import Article
import csv, os
from bs4 import BeautifulSoup
import urllib

req_keywords = ['coronavirus', 'covid-19']
newspaper_base_url = 'http://www.thedailystar.net'
category = 'country'

def checkif_kw_exist(list_one, list_two):
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0: return False, common_kw
    else: return True, common_kw

def get_article_info(url):
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()
    success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
    if success:
        return [url, a.publish_date, a.title, a.text]
    else: return False

output_file = "J:/B/output.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close()

for index in range(1, 50000, 1):
    # page_url is not defined in this excerpt; it is presumably built from
    # newspaper_base_url, category and index elsewhere in the script
    page_soup = BeautifulSoup(urllib.request.urlopen(page_url).read())
    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})
    for tag in primary_tag:
        url = tag.find("a")
        # print(url)
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)
                f.close
        else:
            pass
This is the error I am getting:
---------------------------------------------------------------------------
ArticleException Traceback (most recent call last)
<ipython-input-1-991b432d3bd0> in <module>
65 #print (url)
66 url = newspaper_base_url + url.get('href')
---> 67 result = get_article_info(url)
68 if result is not False:
69 with open(output_file, 'a', encoding='utf-8') as f:
<ipython-input-1-991b432d3bd0> in get_article_info(url)
28 a = Article(url)
29 a.download()
---> 30 a.parse()
31 a.nlp()
32 success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
~\Anaconda3\lib\site-packages\newspaper\article.py in parse(self)
189
190 def parse(self):
--> 191 self.throw_if_not_downloaded_verbose()
192
193 self.doc = self.config.get_parser().fromstring(self.html)
~\Anaconda3\lib\site-packages\newspaper\article.py in throw_if_not_downloaded_verbose(self)
530 elif self.download_state == ArticleDownloadState.FAILED_RESPONSE:
531 raise ArticleException('Article `download()` failed with %s on URL %s' %
--> 532 (self.download_exception_msg, self.url))
533
534 def throw_if_not_parsed_verbose(self):
ArticleException: Article `download()` failed with HTTPSConnectionPool(host='www.thedailystar.net', port=443): Read timed out. (read timeout=7) on URL http://www.thedailystar.net/ugc-asks-private-universities-stop-admissions-grades-without-test-for-coronavirus-pandemic-1890151
The quickest way to 'skip' failures related to the downloaded content is to use a try/except as follows:
def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else: return False
    except:
        return False
Using a bare except to catch every possible exception and ignore it isn't recommended, and this answer would be downvoted if I didn't suggest that you deal with exceptions a little better. You also asked about solving the issue. Without reading the documentation for the libraries you import, you won't know what exceptions might occur, so printing out the details of the exceptions while you're skipping them will give you that information, like the ArticleException you are getting now. You can then start adding individual except sections to deal with the ones you have already encountered:
from newspaper.article import ArticleException  # needed for the explicit except clause

def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except ArticleException as ae:
        print(ae)
        return False
    except Exception as e:
        print(e)
        return False
The ArticleException you are getting is telling you that you are getting a timeout error, which means the response from the Daily Star hasn't completed within a time limit. Maybe it's very busy :) You could try downloading several times before giving up.
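A minimal retry sketch, not from the original answer, assuming the timeout keeps surfacing as the ArticleException raised when parse() is called after a failed download():

import time
from newspaper import Article
from newspaper.article import ArticleException

def get_article_with_retries(url, max_attempts=3, wait_seconds=5):
    # Try the download a few times before giving up on the URL.
    for attempt in range(1, max_attempts + 1):
        article = Article(url)
        try:
            article.download()
            article.parse()  # raises ArticleException if download() failed
            return article
        except ArticleException as ae:
            print(f"Attempt {attempt}/{max_attempts} failed for {url}: {ae}")
            time.sleep(wait_seconds)
    return None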
I am using requests to resolve URLs for about 410K check-in records. However, the process hangs somewhere for hours and I am not sure where the problem is. I did the same thing for 1.7M records before and it worked well. Here is my code:
pat = re.compile("(?P<url>https?://[^\s]+)")  # always compile it

def resolve_url(text):
    url = 'before'
    long_url = 'after'
    error = 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True).url
        except requests.exceptions.RequestException as e:
            error = e
    return (url, long_url, error)

pool = multiprocessing.Pool(200)
resolved_urls = []
for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
    resolved_urls.append(res)
    if i % 10000 == 0 and i > 0:
        print("%d elements have been processed, %2.5f seconds" % (i+1, time.time()-t0))
        fout = open("./yangj/resolved_urls_%d_requests.pkl" % (i+1), "w")
        pickle.dump(resolved_urls, fout)
        fout.close()
        resolved_urls = []

fout = open("./yangj/resolved_urls_last_requests.pkl", "w")
pickle.dump(resolved_urls, fout)
fout.close()
I was wondering whether the problem is caused by some exception that I need to write code to recover from. I have looked through the requests documentation and previous similar questions, but I didn't find matching answers. Any ideas on how to solve the problem?
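One thing worth checking, offered as an assumption rather than a confirmed diagnosis: requests.head() is called here without a timeout, and without one a single unresponsive host can block a worker indefinitely. A minimal sketch of resolve_url() with a timeout (timeout_seconds is an added parameter):

import re
import requests

pat = re.compile("(?P<url>https?://[^\s]+)")

def resolve_url(text, timeout_seconds=10):
    url, long_url, error = 'before', 'after', 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            # timeout applies to connect and read; without it the call can wait forever
            long_url = requests.head(url, allow_redirects=True, timeout=timeout_seconds).url
        except requests.exceptions.RequestException as e:  # includes Timeout
            error = e
    return (url, long_url, error)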