I've been reading dozens of examples for similar issues, but I can't get any of the solutions I've seen (or their variants) to run. I'm screen scraping, and I just want to ignore 404 errors (skip those pages). I get
AttributeError: 'module' object has no attribute 'HTTPError'
I've tried 'URLError' as well. I've seen nearly identical syntax accepted as working answers. Any ideas? Here's what I've got:
import urllib
import datetime
from bs4 import BeautifulSoup

class EarningsAnnouncement:
    def __init__(self, Company, Ticker, EPSEst, AnnouncementDate, AnnouncementTime):
        self.Company = Company
        self.Ticker = Ticker
        self.EPSEst = EPSEst
        self.AnnouncementDate = AnnouncementDate
        self.AnnouncementTime = AnnouncementTime

webBaseStr = 'http://biz.yahoo.com/research/earncal/'
earningsAnnouncements = []
dayVar = datetime.date.today()
for dte in range(1, 30):
    currDay = str(dayVar.day)
    currMonth = str(dayVar.month)
    currYear = str(dayVar.year)
    if (len(currDay) == 1): currDay = '0' + currDay
    if (len(currMonth) == 1): currMonth = '0' + currMonth
    dateStr = currYear + currMonth + currDay
    webString = webBaseStr + dateStr + '.html'
    try:
        #with urllib.request.urlopen(webString) as url: page = url.read()
        page = urllib.request.urlopen(webString).read()
        soup = BeautifulSoup(page)
        tbls = soup.findAll('table')
        tbl6 = tbls[6]
        rows = tbl6.findAll('tr')
        rows = rows[2:len(rows)-1]
        for earn in rows:
            earningsAnnouncements.append(EarningsAnnouncement(earn.contents[0], earn.contents[1],
                earn.contents[3], dateStr, earn.contents[3]))
    except urllib.HTTPError as err:
        if err.code == 404:
            continue
        else:
            raise
    dayVar += datetime.timedelta(days=1)
It looks like for urllib (not urllib2) the exception is urllib.error.HTTPError, not urllib.HTTPError. See the documentation for more information.
Do this:
import urllib.error  # import the error submodule explicitly
except urllib.error.URLError as e:  # use 'urllib.error.URLError', not 'urllib.HTTPError'
    print('Error code: ', e.code)   # or whatever you want
    return e.code
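For context, here is a minimal sketch of how the handler could look in Python 3 (my own illustration, not the poster's code; note that urllib.error.HTTPError carries a .code, while the broader urllib.error.URLError only guarantees a .reason):

import urllib.request
import urllib.error

def fetch(url):
    """Return the page body, or None for a 404 (illustrative helper, not from the question)."""
    try:
        return urllib.request.urlopen(url).read()
    except urllib.error.HTTPError as err:    # HTTP status errors such as 404
        if err.code == 404:
            return None                      # skip missing pages
        raise
    except urllib.error.URLError as err:     # connection-level failures (DNS, refused, ...)
        print('Failed to reach server:', err.reason)
        return None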
Related: Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2

def url1_to_string(url1):
    html = ""
    proxyDict = {
        'http': 'http://username:pwd#proxyurl:8080',
        'https': 'https://username:pwd#proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1, proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

df = pd.read_csv(r'C:\filepath\abc.csv', encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count = []
agg_url_count = []
for index, row in df.iterrows():
    agg_url_list = []
    ini_url = "http://www.google.com/search?q=" + row['ANCHOR_NAME'] + " AND " + row['ACCOUNT_NAME']
    r = requests.get(ini_url, proxies={"http": "http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text, "html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1 = 0
    for x1 in sublist1[:3]:
        anchorcount1 = 0
        accountcount1 = 0
        print("aagg url", x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt:
            print('You cancelled the operation.')
        finally:
            pass
        ny_bb1 = ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME'] = row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME'] = row['ACCOUNT_NAME'].upper()
        anchor_name = re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name = re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if (anchor_name == account_name):
            if (row['ANCHOR_NAME'] in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if (row['ACCOUNT_NAME'] in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        else:
            if (anchor_name in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if (account_name in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        if (anchorcount1 > 0 and accountcount1 > 0):
            aggregate_count1 = aggregate_count1 + 1
            agg_url_list.append(x1[:])
            print("existance of both", aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)
df['anc_cnt'] = pd.Series(anchor_count)
df['acc_cnt'] = pd.Series(account_count)
df['agg_cnt'] = pd.Series(aggregate_page_count)
df['agg_url_list'] = pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block, and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URLs and so on.
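One likely cause (my assumption; the question does not say) is that requests.get is called without a timeout, so a stalled connection simply blocks inside the try rather than raising anything the except could catch. A minimal sketch of a timeout plus a request-exception handler that keeps the loop moving; the helper name and timeout value are illustrative, not from the original post:

import requests

def fetch_text(url, proxies=None, timeout=15):
    """Sketch: return the page text, or None if the request fails or stalls."""
    try:
        res = requests.get(url, proxies=proxies, timeout=timeout)  # a stalled connection now raises instead of blocking
        res.raise_for_status()                                     # raises on 4xx/5xx responses
        return res.text
    except requests.exceptions.RequestException as exc:
        print('Skipping', url, 'because of:', exc)
        return None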
I want to produce a web crawler to download HTML from a website, but I don't know the re module very well, and have gotten stuck.
import urllib2

def download(url):
    print("Downloading: " + url)
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print("Download error: ", e.reason)
        html = None
    return html

FIELD = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone',
         'postal_code_format', 'postal_code_regex', 'languages', 'neighhbours')

import re

def re_scraper(html):
    results = {}
    for field in FIELD:
        results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).group()
    return results

import time

NUM_ITERATIONS = 1000
html = download("http://example.webscraping.com/view/Afghanistan-1")
for name, scraper in [('Regular expressions', re_scraper), ('BeautifulSoup', bs_scraper), ('Lxml', lxml_scraper)]:
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()
        result = scraper(html)
        assert (result['area'] == '647,500 square kilometres')
    end = time.time()
    print('%s: %.2f seconds' % (name, end - start))
The error message:
File "E:/���/Projects/new.py", line 20, in re_scraper
results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).group()
AttributeError: 'NoneType' object has no attribute 'group'
The HTML is:
<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">647,500 square kilometres</td>
I have tested the code, and fetching the HTML and the regex itself are not the problem. The problem may be with field or FIELD. I think their type may be causing this bug, but how can I fix it?
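For what it's worth (my own note, not from the thread): re.search returns None whenever the pattern finds no match, for instance if download returned None or if a field name such as the misspelled 'neighhbours' has no matching row, and calling .group() on None raises exactly this AttributeError. A guarded sketch that also uses group(1) to keep only the cell text:

import re

def re_scraper_safe(html, fields):
    """Sketch: like re_scraper, but skips fields whose pattern finds no match."""
    results = {}
    for field in fields:
        match = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html)
        if match is None:                # this is where .group() on None would have crashed
            print('No match for field:', field)
            continue
        results[field] = match.group(1)  # group(1) is just the table-cell text
    return results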
Okay, here is my code:
from lxml import html
from lxml import etree
from selenium import webdriver
import calendar
import math
import urllib
import progressbar
import requests
Using Selenium:
path_to_driver = '/home/vladislav/Shit/geckodriver'
browser = webdriver.Firefox(executable_path = path_to_driver)
Create a dict where I store the data, and create the progress bars:
DataDict = {}
barY = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barM = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barW = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
Form the parameters in a loop, construct a URL from them, and send a browser.get request:
for year in range(2014, 2016):
    barY.update(year)
    for month in range(1, 13):
        barM.update(month)
        weeks = math.ceil(calendar.monthrange(year, month)[1]/4)
        for week in range(weeks):
            barW.update(week)
            if (week > 2):
                start_day = 22
                end_day = calendar.monthrange(year, month)[1]
            else:
                start_day = 7*week + 1
                end_day = 7*(week + 1)
            start_date = str(year) + '-' + str(month).zfill(2) + '-' + str(start_day).zfill(2)
            end_date = str(year) + '-' + str(month).zfill(2) + '-' + str(end_day).zfill(2)
            params = {'end-date': end_date, 'start-date': start_date}
            url = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'
            url = url + ('&' if urllib.parse.urlparse(url).query else '?') + urllib.parse.urlencode(params)
The request itself:
browser.get(url)
try:
    news_list = browser.find_element_by_class_name('news-list')
    news_list_text = news_list.text
    news_list_text = news_list_text.split('\n')
    for i in range(int(len(news_list_text)/2)):
        DataDict.update({news_list_text[2*i]: news_list_text[2*i+1]})
    print("Found! Adding news to the dictionary!")
except:
    pass
But after 2-4 requests it just freezes :(
What's the problem?
Okay, the problem was an advertising banner, which appeared after several requests. The solution is just to wait (time.sleep) until the banner disappears, and then send the request again:
try:
    browser.get(url)
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text)/2)):
            DataDict.update({news_list_text[2*i]: news_list_text[2*i+1]})
        #print("Found! Adding news to the dictionary!")
    except:
        pass
    time.sleep(10)
except:
    print("perhaps this shitty AD?")
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text)/2)):
            DataDict.update({news_list_text[2*i]: news_list_text[2*i+1]})
        #print("Found! Adding news to the dictionary!")
    except:
        pass
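As an aside (my own sketch, not part of the original answer), Selenium's explicit waits can replace the fixed time.sleep(10), waiting only as long as the banner actually blocks the page; the 'news-list' class name comes from the question, the timeout value is an assumption:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_news_pairs(browser, url, timeout=30):
    """Sketch: load `url` and wait up to `timeout` seconds for the news list to appear."""
    browser.get(url)
    news_list = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'news-list')))
    lines = news_list.text.split('\n')
    # pair up alternating lines the same way the original loop does
    return dict(zip(lines[0::2], lines[1::2]))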
I'm trying to understand BeautifulSoup and want to find all the links within facebook.com and iterate over each and every link within it...
Here is my code... it works fine, but once it finds Linkedin.com and iterates over it, it gets stuck at a point after this URL - http://www.linkedin.com/redir/redirect?url=http%3A%2F%2Fbusiness%2Elinkedin%2Ecom%2Ftalent-solutions%3Fsrc%3Dli-footer&urlhash=f9Nj
When I run Linkedin.com separately, I don't have any problem...
Could this be a limitation of my operating system? I'm using Ubuntu Linux...
import urllib2
import BeautifulSoup
import re

def main_process(response):
    print "Main process started"
    soup = BeautifulSoup.BeautifulSoup(response)
    limit = '5'
    count = 0
    main_link = valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", "http://www.facebook.com")
    if main_link:
        main_link = main_link.group(1)
        print 'main_link = ', main_link
        result = {}
        result[main_link] = {'incoming': [], 'outgoing': []}
        print 'result = ', result
        for link in soup.findAll('a', href=True):
            if count < 10:
                valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", link.get('href'))
                if valid_link:
                    #print 'Main link = ', link.get('href')
                    print 'Links object = ', valid_link.group(1)
                    connecting_link = valid_link.group(1)
                    connecting_link = connecting_link.encode('ascii')
                    if main_link <> connecting_link:
                        print 'outgoing link = ', connecting_link
                        result = add_new_link(connecting_link, result)
                        #Check if the outgoing is already added, if its then don't add it
                        populate_result(result, main_link, connecting_link)
                        print 'result = ', result
                        print 'connecting'
                        request = urllib2.Request(connecting_link)
                        response = urllib2.urlopen(request)
                        soup = BeautifulSoup.BeautifulSoup(response)
                        for sublink in soup.findAll('a', href=True):
                            print 'sublink = ', sublink.get('href')
                            valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", sublink.get('href'))
                            if valid_link:
                                print 'valid_link = ', valid_link.group(1)
                                valid_link = valid_link.group(1)
                                if valid_link <> connecting_link:
                                    populate_result(result, connecting_link, valid_link)
            count += 1
    print 'final result = ', result
    # print 'found a url with national-park in the link'

def add_new_link(connecting_link, result):
    result[connecting_link] = {'incoming': [], 'outgoing': []}
    return result

def populate_result(result, link, dest_link):
    if len(result[link]['outgoing']) == 0:
        result[link]['outgoing'].append(dest_link)
    else:
        found_in_list = 'Y'
        try:
            result[link]['outgoing'].index(dest_link)
            found_in_list = 'Y'
        except ValueError:
            found_in_list = 'N'
        if found_in_list == 'N':
            result[link]['outgoing'].append(dest_link)
    return result

if __name__ == "__main__":
    request = urllib2.Request("http://facebook.com")
    print 'process start'
    try:
        response = urllib2.urlopen(request)
        main_process(response)
    except urllib2.URLError, e:
        print "URLERROR"
    print "program ended"
The problem is re.search() hanging on certain URLs, on this line:
valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", sublink.get('href'))
For example, it hangs on this URL: https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto
>>> import re
>>> s = "https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto"
>>> re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", s)
hanging "forever"...
It looks like this introduces a catastrophic backtracking case that causes the regex search to hang.
One solution would be to use a different regex for validating the URL; see plenty of options here:
How do you validate a URL with a regular expression in Python?
Hope that helps.
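For illustration (my own sketch, not taken from the linked question), parsing the URL with urllib.parse avoids regular expressions, and therefore backtracking, entirely; shown in Python 3, while the question's code is Python 2:

from urllib.parse import urlparse  # 'from urlparse import urlparse' in Python 2

def extract_site(href):
    """Sketch: return 'scheme://host' for http(s) links to .com hosts, else None.

    Mirrors roughly what group(1) of the original regex captured, without any backtracking.
    """
    parsed = urlparse(href)
    if parsed.scheme in ('http', 'https') and parsed.netloc.endswith('.com'):
        return '%s://%s' % (parsed.scheme, parsed.netloc)
    return None

# e.g. extract_site("https://www.facebook.com/campaign/landing.php?placement=pflo")
# returns 'https://www.facebook.com' immediately instead of hanging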
Part of the code containing the error:
select_link = db.GqlQuery("select * from PhishTank where url= :1", str(updated_url))
in_database_phishtank = False
for link in select_link:
    if str(updated_url) == str(link.url):
        in_database_phishtank = True
        # chk for 7 days period , update the link
        if (datetime.now() - link.timestamp) > timedelta(days=TIME_UPDATE):
            # query to the site and update the datastore
            url = "http://checkurl.phishtank.com/checkurl/"
            parameters = {"url": "%s" % updated_url,
                          "app_key": "74283d86612c6b89de0b186882446e069dd071f65e9711aa374e9cdbd2ba7ffe",
                          "format": "json"}
            data = urllib.urlencode(parameters)
            req = urllib.Request(url, data)
            try:
                response = urllib2.urlopen(req)
            except urllib.error.URLError as e:
                self.redirect('/error')
            json_post = response.read()
            data = json.loads(json_post)
Try this:
urllib.request.Request(url, data)
Be aware that in Python 3.x, urllib was split into several modules: urllib.request, urllib.parse, and urllib.error. It's possible that you're importing it wrong.
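Putting that together, a minimal Python 3 sketch of just the request/response part (parameter names come from the question; the .encode('utf-8') is my addition, since urlopen expects bytes for POST data):

import json
import urllib.request
import urllib.parse
import urllib.error

def check_phishtank(updated_url, app_key):
    """Sketch: POST to the PhishTank check endpoint and return the parsed JSON, or None on failure."""
    url = "http://checkurl.phishtank.com/checkurl/"
    parameters = {"url": updated_url, "app_key": app_key, "format": "json"}
    data = urllib.parse.urlencode(parameters).encode('utf-8')  # bytes, as urlopen requires for POST
    req = urllib.request.Request(url, data)
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.URLError:
        return None  # the original handler redirects to '/error' here
    return json.loads(response.read())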