ArticleException error when web scraping news articles with Python

I am trying to web scrape news articles containing certain keywords. I am using Python 3. However, I am not able to get all the articles from the newspaper: after scraping some articles and writing them to the CSV file, I get an ArticleException error. Could anyone help me with this? Ideally, I would like to solve the problem and download all the related articles from the newspaper website. Otherwise, it would also be useful to simply skip the URL that raises the error and continue with the next one. Thanks in advance for your help.
This is the code I am using:
import urllib.request
import newspaper
from newspaper import Article
import csv, os
from bs4 import BeautifulSoup
import urllib

req_keywords = ['coronavirus', 'covid-19']
newspaper_base_url = 'http://www.thedailystar.net'
category = 'country'

def checkif_kw_exist(list_one, list_two):
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0:
        return False, common_kw
    else:
        return True, common_kw

def get_article_info(url):
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()
    success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
    if success:
        return [url, a.publish_date, a.title, a.text]
    else:
        return False

output_file = "J:/B/output.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close()

for index in range(1, 50000, 1):
    # page_url was missing from the snippet; this assumes the site's paginated category listing
    page_url = newspaper_base_url + '/' + category + '?page=' + str(index)
    page_soup = BeautifulSoup(urllib.request.urlopen(page_url).read(), 'html.parser')
    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})
    for tag in primary_tag:
        url = tag.find("a")
        #print (url)
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)
        else:
            pass
This is the error I am getting:
---------------------------------------------------------------------------
ArticleException Traceback (most recent call last)
<ipython-input-1-991b432d3bd0> in <module>
65 #print (url)
66 url = newspaper_base_url + url.get('href')
---> 67 result = get_article_info(url)
68 if result is not False:
69 with open(output_file, 'a', encoding='utf-8') as f:
<ipython-input-1-991b432d3bd0> in get_article_info(url)
28 a = Article(url)
29 a.download()
---> 30 a.parse()
31 a.nlp()
32 success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
~\Anaconda3\lib\site-packages\newspaper\article.py in parse(self)
189
190 def parse(self):
--> 191 self.throw_if_not_downloaded_verbose()
192
193 self.doc = self.config.get_parser().fromstring(self.html)
~\Anaconda3\lib\site-packages\newspaper\article.py in throw_if_not_downloaded_verbose(self)
530 elif self.download_state == ArticleDownloadState.FAILED_RESPONSE:
531 raise ArticleException('Article `download()` failed with %s on URL %s' %
--> 532 (self.download_exception_msg, self.url))
533
534 def throw_if_not_parsed_verbose(self):
ArticleException: Article `download()` failed with HTTPSConnectionPool(host='www.thedailystar.net', port=443): Read timed out. (read timeout=7) on URL http://www.thedailystar.net/ugc-asks-private-universities-stop-admissions-grades-without-test-for-coronavirus-pandemic-1890151

The quickest way to 'skip' failures related to the downloaded content is to use a try/except as follows:
def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except:
        return False
Using a bare except to catch every possible exception, and ignoring it, isn't recommended, and this answer would be downvoted if I didn't suggest you deal with exceptions a little better. You also asked about solving the issue. Without reading the documentation for the libraries you import, you won't know what exceptions might occur, so printing out details of the exceptions while you skip them will give you those details, like the ArticleException you are getting now. You can then start adding individual except sections to deal with the ones you have already encountered:
from newspaper.article import ArticleException  # needed so the exception can be caught by name

def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except ArticleException as ae:
        print(ae)
        return False
    except Exception as e:
        print(e)
        return False
The ArticleException you are getting is telling you that you hit a timeout: the response from the Daily Star didn't complete within the time limit. Maybe the site is very busy :) You could try downloading several times before giving up.
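For example, here is a minimal retry sketch (not from the original post; the attempt count and wait time are arbitrary) that also raises newspaper's request timeout through its Config object, since the traceback shows the default 7-second read timeout being hit:

import time
from newspaper import Article, Config
from newspaper.article import ArticleException

config = Config()
config.request_timeout = 20  # default is 7 seconds, the timeout shown in the error above

def download_with_retries(url, attempts=3, wait=5):
    """Try to download and parse an article a few times before giving up."""
    for attempt in range(attempts):
        a = Article(url, config=config)
        try:
            a.download()
            a.parse()
            return a
        except ArticleException as ae:
            print('attempt %d failed: %s' % (attempt + 1, ae))
            time.sleep(wait)  # give the server a moment before retrying
    return None

If it returns None you can skip the URL, exactly as the try/except version above does.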

Related

snscrape twitter using Python

I was told that snscrape for Python is still failing. I tried to fix it by adding top=True to the TwitterSearchScraper call, but it still errors.
Here is my code:
import pandas as pd
import snscrape.modules.twitter as sntwitter

pd.options.display.max_colwidth = 500
query = "(music) lang:en since:2023-01-01 until:2023-02-02"
tweets = []
limit = 10

get_ipython().run_line_magic('time', '')
try:
    print("start scraping")
    for tweet in sntwitter.TwitterSearchScraper(query=query, top=True).get_items():
        if len(tweets) == limit:
            break
        else:
            tweets.append([tweet.date, tweet.user.username, tweet.content])
    df = pd.DataFrame(tweets, columns=['datetime', 'username', 'content'])
except Exception as e:
    print(e)
print("Finished")
print("-------")
Can somebody solve this error?
Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%28Gempa%29+lang%3Aid++since%3A2023-01-01+until%3A2023-02-02&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%28Gempa%29+lang%3Aid++since%3A2023-01-01+until%3A2023-02-02&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.

2Captcha: How can I reset after getting ERROR_CAPTCHA_UNSOLVABLE?

I've created a script for web scraping and I'm using 2Captcha to solve captchas. 2Captcha has a Python library, but I've created my own functions to generate the captcha ID and captcha token code.
My captcha module has 3 functions: get_captcha_id(), get_captcha_response(), and apply_token()
Everything works great, and I'm able to solve a couple dozen captchas until eventually I get the following error:
ERROR_WRONG_CAPTCHA_ID
When this happens, the script first hits ERROR_CAPTCHA_UNSOLVABLE, then the loop goes back and generates an entirely new captcha ID. Maybe I should keep the same ID and just generate a new token?
I just want to know if there's a better way to do this anyway...
Here is the code that starts 2Captcha in my main script:
captcha_solved = 0
# Solves recaptcha via 2Captcha API
while captcha_solved == 0:
    captcha_id = captcha.get_captcha_id(browser.current_url)
    if captcha_id != 0 and captcha_id is not None:
        print("Captcha ID is: " + str(captcha_id))
        cap_res = captcha.get_captcha_response(captcha_id)
        if cap_res == "ERROR_CAPTCHA_UNSOLVABLE" or cap_res == "ERROR_TOKEN_EXPIRED" or cap_res == "ERROR_WRONG_CAPTCHA_ID":
            print("Captcha failed... Restarting captcha")
            browser.refresh()
            sleep(1)
            continue
        else:
            print("Captcha Token: " + cap_res)
            captcha.apply_token(browser, cap_res)
            solver.report(captcha_id, True)  # solver is the TwoCaptcha instance created in captcha.py
            captcha_solved = captcha_solved + 1
            break
Once this while loop completes, the main script starts. After about two dozen captchas or so, I'll receive this error:
Traceback (most recent call last):
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\main.py", line 191, in <module>
cap_res = captcha.get_captcha_response(captcha_id)
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\captcha.py", line 83, in get_captcha_response
solver.report(cap_id, False)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\solver.py", line 496, in report
self.api_client.res(key=self.API_KEY, action=rep, id=id_)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\api.py", line 113, in res
raise ApiException(resp)
twocaptcha.api.ApiException: ERROR_WRONG_CAPTCHA_ID
I thought I had added enough failsafes to be able to regenerate a captcha token.
Here is my captcha.py file code:
from twocaptcha import TwoCaptcha
from random import randint
from time import sleep
from urllib.request import urlopen, Request
import re
from bs4 import BeautifulSoup
from twocaptcha.solver import ValidationException
from twocaptcha.api import NetworkException, ApiException
from selenium.common.exceptions import TimeoutException

#solver = TwoCaptcha('API_KEY')
site_key = "###"
api_key = "###"

config = {
    'server': '2captcha.com',
    'apiKey': api_key,
    'callback': 'https://your.site.com/',
    'defaultTimeout': 120,
    'recaptchaTimeout': 600,
    'pollingInterval': 10,
}

proxy = {
    'type': 'HTTP',
    'uri': '###'
}

user_agent = '###'

solver = TwoCaptcha(**config)
print("2Captcha Balance: $" + str(solver.balance()))

def get_captcha_id(captcha_url):
    try:
        result = solver.recaptcha(sitekey=site_key, url=captcha_url, proxy=proxy)
        #print(result)
        split_string = str(result).split(":", 1)
        substring = split_string[0]
        #print(substring)
        if (substring == "{'captchaId'"):
            strip_beginning = re.sub("{'captchaId': '", "", str(result))
            captcha_id = re.sub("'}", "", strip_beginning)
            return captcha_id
        else:
            print("could not find captcha ID")
            return 0
    except ValidationException as e:
        # invalid parameters passed
        print(e)
        return e
    except NetworkException as e:
        # network error occurred
        print(e)
        return e
    except ApiException as e:
        # api responded with an error
        print(e)
        return e
    except TimeoutException as e:
        # captcha is not solved so far
        print(e)
        return e

def get_captcha_response(cap_id):
    capcha_ready = 0
    response_url = "https://2captcha.com/res.php?key=" + api_key + "&action=get&id=" + cap_id
    while capcha_ready == 0:
        PageRequest = Request(response_url, data=None, headers={'User-Agent': user_agent})
        PageResponse = urlopen(PageRequest)
        PageHtml = PageResponse.read()
        PageSoup = BeautifulSoup(PageHtml, 'html.parser')
        SoupText = str(PageSoup)
        if SoupText == "ERROR_CAPTCHA_UNSOLVABLE" or SoupText == "ERROR_WRONG_CAPTCHA_ID" or SoupText == "ERROR_TOKEN_EXPIRED":
            solver.report(cap_id, False)
            return SoupText
        elif str(PageSoup) == "CAPCHA_NOT_READY":
            print("Waiting for captcha response...")
            rand = randint(12, 18)
            print("sleeping for " + str(rand) + " seconds")
            sleep(rand)
        else:
            split_string = str(PageSoup).split("|", 1)
            if len(split_string) > 1:
                substring = split_string[1]
                return substring
            capcha_ready = capcha_ready + 1
    #print(PageSoup)
    return PageSoup

def apply_token(browser, token):
    print("Applying token to browser...")
    browser.execute_script('document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(token))
    print("Token applied")
Thanks for your help with this, I really appreciate it!
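The traceback shows that the crash does not come from solving at all: solver.report(cap_id, False) itself raises ApiException with ERROR_WRONG_CAPTCHA_ID, presumably because the ID has already expired or been reported on the server side. A minimal sketch (an assumption based on the code above, not a confirmed fix) that keeps a failed report from killing the script, so the loop can go on and request a fresh captcha ID:

from twocaptcha.api import ApiException

def safe_report(solver, cap_id, ok):
    """Report a captcha result, but never let a stale or wrong ID crash the script."""
    try:
        solver.report(cap_id, ok)
    except ApiException as e:
        # e.g. ERROR_WRONG_CAPTCHA_ID when the ID is no longer valid server-side
        print("report failed for %s: %s" % (cap_id, e))

Using safe_report(...) in place of the bare solver.report(...) calls in get_captcha_response() and in the main loop leaves the retry logic in control instead of crashing.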

Python: requests hang for hours

I am using requests to resolve URLs for about 410K check-ins. However, the process hangs somewhere for hours and I am not sure where the problem is. I did the same thing for 1.7M pieces of data before and it worked well. Here is my code:
import re
import time
import pickle
import multiprocessing
import requests

pat = re.compile("(?P<url>https?://[^\s]+)")  # always compile it

def resolve_url(text):
    url = 'before'
    long_url = 'after'
    error = 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True).url
        except requests.exceptions.RequestException as e:
            error = e
    return (url, long_url, error)

pool = multiprocessing.Pool(200)
resolved_urls = []
t0 = time.time()  # start time for progress reporting
# text_with_url: the ~410K check-in strings, loaded elsewhere
for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
    resolved_urls.append(res)
    if i % 10000 == 0 and i > 0:
        print("%d elements have been processed, %2.5f seconds" % (i + 1, time.time() - t0))
        fout = open("./yangj/resolved_urls_%d_requests.pkl" % (i + 1), "w")
        pickle.dump(resolved_urls, fout)
        fout.close()
        resolved_urls = []
fout = open("./yangj/resolved_urls_last_requests.pkl", "w")
pickle.dump(resolved_urls, fout)
fout.close()
I was wondering whether the problem is caused by some exception that I need to write code to recover from. I have looked through the requests documentation and previous similar questions but didn't find a matching answer. Any ideas on how to solve the problem?
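One likely culprit (a hedged guess, not confirmed by the question): requests has no default timeout, so a single unresponsive server can make requests.head block indefinitely and stall the whole pool. Passing an explicit timeout, as in the sketch below, guarantees every call eventually returns or raises a RequestException that the existing except clause already handles:

import re
import requests

pat = re.compile(r"(?P<url>https?://\S+)")  # same pattern as above

def resolve_url_with_timeout(text, timeout=(5, 10)):
    # identical logic to resolve_url, plus (connect, read) timeouts so no call can hang forever
    url, long_url, error = 'before', 'after', 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True, timeout=timeout).url
        except requests.exceptions.RequestException as e:
            # Timeout is a subclass of RequestException, so hung hosts end up here
            error = e
    return (url, long_url, error)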

Fetching the first image from a website that belongs to the post

I've written a program that fetches the desired information from a blog or any page. The next thing I want to achieve is to retrieve the first image from that page that belongs to the respective post (just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image that has an alt tag (since many websites don't have alt attributes on their logos, icons, etc., the first one that does should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using Python 2.7.9 and BeautifulSoup 4.
import time
import feedparser
import requests
from bs4 import BeautifulSoup

d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')
for entry in d.entries:
    try:
        if entry.title is not None:
            print entry.title
            print ""
    except Exception, e:
        print e
    try:
        if entry.link is not None:
            print entry.link
            print ""
    except Exception, e:
        print e
    try:
        if entry.published[5:16] is not None:
            print entry.published[5:16]
            print ""
    except Exception, e:
        print e
    try:
        if entry.category is not None:
            print entry.category
            print ""
    except Exception, e:
        print e
    try:
        if entry.get('summary', '') is not None:
            print entry.get('summary', '')
            print ""
    except Exception, e:
        print e
    time.sleep(5)
    r = requests.get(entry.link, headers={'User-Agent': 'Safari/534.55.3 '})
    soup = BeautifulSoup(r.text, 'html.parser')
    for img in soup.findAll('img'):
        if img.has_attr('alt'):
            if img['src'].endswith('.jpg') or img['src'].endswith('.png'):
                print img['src']
                break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and correct it the way you like.
It will fetch "first image" from HTML code or use og:image.
If you want to learn, you can also do it by looking at the source code. The module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph

def parser(self, html):
    """
    """
    if not isinstance(html, BeautifulSoup):
        doc = BeautifulSoup(html, from_encoding='utf-8')
    else:
        doc = html
    ogs = doc.html.head.findAll(property=re.compile(r'^og'))
    for og in ogs:
        self[og[u'property'][3:]] = og[u'content']
    # Couldn't fetch all attrs from og tags, try scraping body
    if not self.is_valid() and self.scrape:
        for attr in self.required_attrs:
            if not hasattr(self, attr):
                try:
                    self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                except AttributeError:
                    pass

OpenGraph.parser = parser
OpenGraph.scrape = True  # workaround for some subtle bug in opengraph
You may need to handle relative URLs in the image sources, but that is quite straightforward with urljoin from urlparse:
import opengraph
from urlparse import urljoin
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
    ...
    image_url = page.get('image', None)
    ...
    if not image_url.startswith('http'):
        image_url = urljoin(page['_url'], page['image'])
(some checks are omitted from the code fragment for brevity)
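Putting that fragment together into one helper, a minimal sketch (assuming the monkeypatch above has been applied and the opengraph package is installed) could look like this:

import opengraph
from urlparse import urljoin

def get_post_image(link):
    """Return the post's image URL via og:image / scraping, or None if nothing is found."""
    page = opengraph.OpenGraph(url=link, scrape=True)
    if not page.is_valid():
        return None
    image_url = page.get('image', None)
    if image_url and not image_url.startswith('http'):
        # resolve relative image paths against the page URL
        image_url = urljoin(page['_url'], image_url)
    return image_url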

How to modify this script to check for HTTP status (404, 200)

I am currently using the following script to load a list of URLs and then check the source of each for a list of error strings. If no error string is found in the source, the URL is considered valid and written to a text file.
How can I modify this script to check the HTTP status instead? If a URL returns a 404 it would be ignored; if it returns 200 the URL would be written to the text file. Any help would be much appreciated.
import urllib2
import sys

error_strings = ['invalid product number', 'specification not available. please contact customer services.']

def check_link(url):
    if not url:
        return False
    f = urllib2.urlopen(url)
    html = f.read()
    result = False
    if html:
        result = True
        html = html.lower()
        for s in error_strings:
            if s in html:
                result = False
                break
    return result

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print 'Usage: %s <file_containing_urls>' % sys.argv[0]
    else:
        output = open('valid_links.txt', 'w+')
        for url in open(sys.argv[1]):
            if check_link(url.strip()):
                output.write('%s\n' % url.strip())
                output.flush()
        output.close()
You can alter your call to urlopen slightly:
>>> try:
... f = urllib2.urlopen(url)
... except urllib2.HTTPError, e:
... print e.code
...
404
Utilizing the e.code, you can check if it 404s on you. If you don't hit the except block, you can utilize f as you currently do.
urllib2.urlopen gives back a file-like object with some other methods, one of which, getcode(), is what you're looking for; just add a line:
if f.getcode() != 200:
    return False
in the relevant place in check_link().
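For instance, keeping the rest of the original check_link() unchanged, the added line would sit right after the urlopen call (a sketch of this suggestion, not a replacement for the error-string check):

import urllib2

error_strings = ['invalid product number', 'specification not available. please contact customer services.']

def check_link(url):
    if not url:
        return False
    f = urllib2.urlopen(url)
    # the new line: reject anything that did not come back with HTTP 200
    if f.getcode() != 200:
        return False
    html = f.read()
    result = False
    if html:
        result = True
        html = html.lower()
        for s in error_strings:
            if s in html:
                result = False
                break
    return result

Note that urllib2.urlopen raises HTTPError for a 404 before getcode() is ever reached, which is why combining this with the try/except from the previous answer is still a good idea.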
Try this:
def check_link(url):
    if not url:
        return False
    code = None
    try:
        f = urllib2.urlopen(url)
        code = f.getcode()
    except urllib2.HTTPError, e:
        code = e.code
    result = True
    if code != 200:
        result = False
    return result
Alternatively, if you just need to maintain a list of invalid status codes and check against those, it would be something like this:
invalid_code_strings = [404, 500, 503]  # example values; list whatever status codes you want to reject

def check_link(url):
    if not url:
        return False
    code = None
    try:
        f = urllib2.urlopen(url)
        code = f.getcode()
    except urllib2.HTTPError, e:
        code = e.code
    result = True
    if code in invalid_code_strings:
        result = False
    return result
