BeautifulSoup findAll gets stuck without processing - Python

I'm trying to understand BeautifulSoup and want to find all the links within facebook.com and iterate over each and every link within it...
Here is my code... it works fine, but once it finds Linkedin.com and iterates over it, it gets stuck at a point after this URL - http://www.linkedin.com/redir/redirect?url=http%3A%2F%2Fbusiness%2Elinkedin%2Ecom%2Ftalent-solutions%3Fsrc%3Dli-footer&urlhash=f9Nj
When I run Linkedin.com separately, I don't have any problem...
Could this be a limitation of my operating system? I'm using Ubuntu Linux...
import urllib2
import BeautifulSoup
import re

def main_process(response):
    print "Main process started"
    soup = BeautifulSoup.BeautifulSoup(response)
    limit = '5'
    count = 0
    main_link = valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$","http://www.facebook.com")
    if main_link:
        main_link = main_link.group(1)
    print 'main_link = ', main_link
    result = {}
    result[main_link] = {'incoming':[],'outgoing':[]}
    print 'result = ', result
    for link in soup.findAll('a',href=True):
        if count < 10:
            valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$",link.get('href'))
            if valid_link:
                #print 'Main link = ', link.get('href')
                print 'Links object = ', valid_link.group(1)
                connecting_link = valid_link.group(1)
                connecting_link = connecting_link.encode('ascii')
                if main_link <> connecting_link:
                    print 'outgoing link = ', connecting_link
                    result = add_new_link(connecting_link, result)
                    #Check if the outgoing is already added, if its then don't add it
                    populate_result(result,main_link,connecting_link)
                    print 'result = ', result
                    print 'connecting'
                    request = urllib2.Request(connecting_link)
                    response = urllib2.urlopen(request)
                    soup = BeautifulSoup.BeautifulSoup(response)
                    for sublink in soup.findAll('a',href=True):
                        print 'sublink = ', sublink.get('href')
                        valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$",sublink.get('href'))
                        if valid_link:
                            print 'valid_link = ', valid_link.group(1)
                            valid_link = valid_link.group(1)
                            if valid_link <> connecting_link:
                                populate_result(result,connecting_link,valid_link)
            count += 1
    print 'final result = ', result
    # print 'found a url with national-park in the link'

def add_new_link(connecting_link, result):
    result[connecting_link] = {'incoming':[],'outgoing':[]}
    return result

def populate_result(result,link,dest_link):
    if len(result[link]['outgoing']) == 0:
        result[link]['outgoing'].append(dest_link)
    else:
        found_in_list = 'Y'
        try:
            result[link]['outgoing'].index(dest_link)
            found_in_list = 'Y'
        except ValueError:
            found_in_list = 'N'
        if found_in_list == 'N':
            result[link]['outgoing'].append(dest_link)
    return result

if __name__ == "__main__":
    request = urllib2.Request("http://facebook.com")
    print 'process start'
    try:
        response = urllib2.urlopen(request)
        main_process(response)
    except urllib2.URLError, e:
        print "URLERROR"
    print "program ended"

The problem is re.search() hanging on certain URLs, at this line:
valid_link = re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", sublink.get('href'))
For example, it hangs on the URL https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto:
>>> import re
>>> s = "https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto"
>>> re.search("^(https?://(?:\w+.)+\.com)(?:/.*)?$", s)
hanging "forever"...
It looks like the pattern introduces a catastrophic backtracking case that causes the regex search to hang.
One solution would be to use a different regex for validating the URL; there are plenty of options here:
How do you validate a URL with a regular expression in Python?
Hope that helps.
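For what it's worth, here is a minimal sketch of two ways to avoid the backtracking; both are illustrative rather than drop-in replacements for the validation in the question. The first escapes the dot inside the repeated group (the unescaped . overlaps with \w+, which is what makes the partitioning ambiguous), the second skips the regex entirely and inspects the parsed URL:
import re

try:  # urlparse moved between Python 2 and 3
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

s = "https://www.facebook.com/campaign/landing.php?placement=pflo&campaign_id=402047449186&extra_1=auto"

# Variant 1: escape the dot so each repetition is unambiguous (word chars, then a literal dot)
match = re.search(r"^(https?://(?:\w+\.)+com)(?:/.*)?$", s)
print(match.group(1) if match else None)  # https://www.facebook.com

# Variant 2: no regex at all -- parse the URL and check scheme and host suffix
parsed = urlparse(s)
if parsed.scheme in ("http", "https") and parsed.netloc.endswith(".com"):
    print(parsed.scheme + "://" + parsed.netloc)  # https://www.facebook.com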

Related

While running python code program flow gets stuck in try block

Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2

def url1_to_string(url1):
    html=""
    proxyDict = {
        'http': 'http://username:pwd#proxyurl:8080',
        'https': 'https://username:pwd#proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1,proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]

for index, row in df.iterrows():
    agg_url_list = []
    ini_url="http://www.google.com/search?q="+row['ANCHOR_NAME']+" AND "+row['ACCOUNT_NAME']
    r = requests.get(ini_url,proxies={"http":"http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text,"html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1=0
    for x1 in sublist1[:3]:
        anchorcount1=0
        accountcount1=0
        print("aagg url",x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt: print('You cancelled the operation.')
        finally:
            pass
        ny_bb1=ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME']=row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME']=row['ACCOUNT_NAME'].upper()
        anchor_name=re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name=re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if(anchor_name==account_name):
            if(row['ANCHOR_NAME'] in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        else:
            if (anchor_name in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(account_name in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        if(anchorcount1 > 0 and accountcount1 > 0):
            aggregate_count1=aggregate_count1+1
            agg_url_list.append(x1[:])
            print("existance of both",aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)

df['anc_cnt']=pd.Series(anchor_count)
df['acc_cnt']=pd.Series(account_count)
df['agg_cnt']=pd.Series(aggregate_page_count)
df['agg_url_list']=pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block, and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URLs and so on.
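One point that may explain the hang (a sketch under the assumption that the request itself is what stalls): requests.get() has no timeout by default, so a dead proxy or unresponsive server can block forever and no exception is ever raised, which means control never reaches the except block. Passing an explicit timeout makes the call fail fast instead; the timeout value and helper name below are just for illustration:
import requests

def fetch_text(url, proxies=None):
    # a timeout makes a stalled connection raise instead of blocking forever
    try:
        res = requests.get(url, proxies=proxies, timeout=10)
        return res.text
    except requests.exceptions.RequestException as exc:
        print('request failed, skipping:', exc)
        return ""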

Python check if website exists for a list of websites

I want to check if a website exists, given a list of websites in the format XXXXX.com, where XXXXX is a 5-digit number. So I want to go through from 00000 up to 99999 and see if those variants of the website exist.
I want to do something like
import requests
request = requests.get('http://www.example.com')
if request.status_code == 200:
print('Web site exists')
else:
print('Web site does not exist')
But I want to generate a list of some sort (or even just export a list to CSV), so that for each URL I know whether it exists or not.
Any advice would be great!
I'm going to make an assumption that you have a large list of URLs and you want to read them in from some source file, let's say a text file, rather than hard-coding a large list of URLs in a Python file. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()
file = open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        # ...
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        # ...
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        # 200
        # ...
        print('good' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=20)
    result = p.map(checkurl, urls)
    print("done in : ", time.time()-start)
Try combining xrange and the string zfill method in a loop.
import requests

def test_for_200(url):
    req = requests.get(url)
    return req.status_code == 200

def numbers():
    for n in xrange(100000):
        yield str(n).zfill(5)

results = {}
for num in numbers():
    url = "http://{}.com".format(num)
    results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}
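Since the question also mentions exporting to CSV, here is a minimal sketch of writing the results dict built above out to a file (Python 3 assumed; the file name and column headers are just suggestions):
import csv

with open('url_check_results.csv', 'w', newline='') as fh:
    writer = csv.writer(fh)
    writer.writerow(['number', 'exists'])
    for num, exists in sorted(results.items()):
        writer.writerow([num, exists])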

How to deal with special characters in HTML

I am reading in some XML data, and in particular I have the following string:
H.P. Dembinski, B. K\'{e}gl, I.C. Mari\c{s}, M. Roth, D. Veberi\v{c}
This is LaTeX notation. I am using MathJax, but without the $ signs MathJax does not recognize this text, so it is shown in my browser as printed above.
I am reading in the XML data with the following code:
today = some date
base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
url = (base_url + "from=%s&until=%s&" % (today, today) + "metadataPrefix=arXivRaw")
try:
    response = urllib2.urlopen(url)
except urllib2.HTTPError, e:
    return
rawdata = response.read()
root = ET.fromstring(rawdata)
if root.find(OAI+'ListRecords') is not None:
    for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
        author_string = info.find(ARXIVRAW+"authors").text
I can read the same text using feedparser and in that case I get
u'H. P. Dembinski, B. K\xe9gl, I. C. Mari\u015f, M. Roth, D. Veberi\u010d'
which my browser can correctly interpret with all the special characters. Here is my feedparser solution
url = 'some url'
response = urllib.urlopen(url).read().decode('latin-1')
feed = feedparser.parse(response)
for entry in feed.entries:
    data = {}
    try:
        data['authors'] = ', '.join(author.name for author in entry.authors)
    except AttributeError:
        data['authors'] = ''
How do I have to change my ElementTree solution (the first one) to get the same string as from the feedparser solution?
EDIT: Here is a working piece of code which produces the unwanted result:
import urllib2
from itertools import ifilter
import xml.etree.ElementTree as ET
import feedparser

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"
ARXIVRAW = "{http://arxiv.org/OAI/arXivRaw/}"

def main():
    url = "http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:1503.09027&metadataPrefix=arXivRaw"
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError, e:
        return
    rawdata = response.read().decode('latin-1')
    root = ET.fromstring(rawdata)
    record = root.find(OAI+'GetRecord').findall(OAI+"record")
    meta = record[0].find(OAI+'metadata')
    info = meta.find(ARXIVRAW+"arXivRaw")
    print "author = ", info.find(ARXIVRAW+"authors").text

    base_url = 'http://export.arxiv.org/api/query?'
    search_query = 'id:1503.09027'
    max_results = 2000
    sortBy = 'submittedDate'
    sortOrder = 'ascending'
    query = 'search_query=%s&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query, max_results, sortBy, sortOrder)
    response = urllib2.urlopen(base_url+query).read().decode('latin-1')
    feed = feedparser.parse(response)
    for entry in feed.entries:
        print "entry.authors = ", entry.authors

if __name__ == "__main__":
    main()
output:
python test.py
author = H.P. Dembinski, B. K\'{e}gl, I.C. Mari\c{s}, M. Roth, D. Veberi\v{c}
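The difference appears to come from the data itself rather than from ElementTree: the arXivRaw metadata stores the author names as raw TeX, while the API feed that feedparser reads already contains Unicode (as the two outputs above show). One illustrative workaround (not a complete TeX decoder; the macro table below only covers the accents in this example) is to translate the accent macros after parsing:
# Illustrative only: map the TeX accent macros seen in the example to Unicode.
TEX_TO_UNICODE = {
    r"\'{e}": u"\u00e9",  # e-acute
    r"\c{s}": u"\u015f",  # s-cedilla
    r"\v{c}": u"\u010d",  # c-caron
}

def detex(text):
    for macro, char in TEX_TO_UNICODE.items():
        text = text.replace(macro, char)
    return text

authors = r"H.P. Dembinski, B. K\'{e}gl, I.C. Mari\c{s}, M. Roth, D. Veberi\v{c}"
print(detex(authors))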

Dictionary python specific key

I have code which scrapes everything from a specific web page. I now want to build code which can give me specific details: for example, if I enter a style ID, it should give me the details related to it, or if I enter a category, it should give me all the items in that category with their details. My code is:
import requests, re
from bs4 import BeautifulSoup

url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links=soup.find_all("a")
img=soup.find(itemprop="image")

g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
    links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2=re.compile("clothing/(\w+)")
    for link in links_2:
        match_1=pattern_2.search(link["href"])
        if match_1:
            print ("Category:- " + match_1.group(1))
            break

g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        print ("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1=item.find("ol", {"class":"breadcrumb"})
        a_2=a_1.text
        print a_2
    except:
        pass
    try:
        print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
    except:
        pass
    try:
        d2=item.find("div",{"class":"panel-body standard-p"})
        d3=d2.text
        p_id=re.findall(r'[0-9]{9}',d3)
        id_2=p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1= requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print ("DETAILS:- " + d3+';')
        print ("\nStyle ID:- " + id_2+';')
        print ("\nRecommended Product ID's:- ")
        print (','.join(i for i in product_ids))
    except:
        pass
    try:
        print ("\nURL:-" + img["src"]+';')
    except:
        pass
    try:
        print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
    except:
        pass
    try:
        print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
    except:
        pass

g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
    links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match=pattern_1.search(link["href"])
        if match:
            print ("\nProduct ID of other color:-")
            print (match.group(1))
I added a dictionary called d
import requests, re
from bs4 import BeautifulSoup

d={}
url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links = soup.find_all("a")
d["links"] = []
d["links"].append(("href", [link.get("href") for link in links]))
d["links"].append(("class", [link.get("class") for link in links]))
img=soup.find(itemprop="image")
d["img"] = []
d["img"].append([("alt", img.get("alt")), ("src", img.get("src")), ("itemprop", img.get("itemprop")), ("class", img.get("class")[0])]) #You will have to put d["img"]["0"] instead of d["img"]["alt"]

g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
    links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2=re.compile("clothing/(\w+)")
    for link in links_2:
        match_1=pattern_2.search(link["href"])
        if match_1:
            print ("Category:- " + match_1.group(1))
            break

g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        d["Brand"] = item.contents[1].text
        print ("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1=item.find("ol", {"class":"breadcrumb"})
        a_2=a_1.text
        d["a_2"] = a_2
        print a_2
    except:
        pass
    try:
        print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
        d["Type"] = item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text
    except:
        pass
    try:
        d2=item.find("div",{"class":"panel-body standard-p"})
        d3=d2.text
        p_id=re.findall(r'[0-9]{9}',d3)
        id_2=p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1= requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print ("DETAILS:- " + d3+';')
        d["Details"] = d3.split(",")
        print ("\nStyle ID:- " + id_2+';')
        d["Style"] = ("ID", id_2)
        print ("\nRecommended Product ID's:- ")
        print (','.join(i for i in product_ids))
        d["RecommendedProductIDs"] = [i for i in product_ids]
    except:
        pass
    try:
        print ("\nURL:-" + img["src"]+';')
    except:
        pass
    try:
        print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
    except:
        pass
    try:
        print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
    except:
        pass

g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
    links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match=pattern_1.search(link["href"])
        if match:
            print ("\nProduct ID of other color:-")
            print (match.group(1))
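A minimal sketch of how the populated dictionary might then be queried, assuming the keys built above (the helper name is just an example):
def lookup(details, key):
    # keys correspond to what the scraping loop stores,
    # e.g. "Brand", "Type", "Style", "Details", "RecommendedProductIDs"
    return details.get(key, "No such key: " + key)

print(lookup(d, "Brand"))
print(lookup(d, "Style"))
print(lookup(d, "Details"))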

How do I catch a 404 error in urllib? (python 3)

I've been reading tens of examples for similar issues, but I can't get any of the solutions I've seen or their variants to run. I'm screen scraping, and I just want to ignore 404 errors (skip the pages). I get
AttributeError: 'module' object has no attribute 'HTTPError'
I've tried URLError as well. I've seen nearly identical syntax accepted in working answers. Any ideas? Here's what I've got:
import urllib
import datetime
from bs4 import BeautifulSoup

class EarningsAnnouncement:
    def __init__(self, Company, Ticker, EPSEst, AnnouncementDate, AnnouncementTime):
        self.Company = Company
        self.Ticker = Ticker
        self.EPSEst = EPSEst
        self.AnnouncementDate = AnnouncementDate
        self.AnnouncementTime = AnnouncementTime

webBaseStr = 'http://biz.yahoo.com/research/earncal/'
earningsAnnouncements = []
dayVar = datetime.date.today()
for dte in range(1, 30):
    currDay = str(dayVar.day)
    currMonth = str(dayVar.month)
    currYear = str(dayVar.year)
    if (len(currDay)==1): currDay = '0' + currDay
    if (len(currMonth)==1): currMonth = '0' + currMonth
    dateStr = currYear + currMonth + currDay
    webString = webBaseStr + dateStr + '.html'
    try:
        #with urllib.request.urlopen(webString) as url: page = url.read()
        page = urllib.request.urlopen(webString).read()
        soup = BeautifulSoup(page)
        tbls = soup.findAll('table')
        tbl6= tbls[6]
        rows = tbl6.findAll('tr')
        rows = rows[2:len(rows)-1]
        for earn in rows:
            earningsAnnouncements.append(EarningsAnnouncement(earn.contents[0], earn.contents[1],
                                                              earn.contents[3], dateStr, earn.contents[3]))
    except urllib.HTTPError as err:
        if err.code == 404:
            continue
        else:
            raise
    dayVar += datetime.timedelta(days=1)
It looks like for urllib (as opposed to urllib2) the exception is urllib.error.HTTPError, not urllib.HTTPError. See the documentation for more information.
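Applied to the loop in the question, that means importing urllib.request and urllib.error explicitly and catching urllib.error.HTTPError; a minimal self-contained sketch (the helper name is just for illustration):
import urllib.request
import urllib.error

def fetch_or_skip(url):
    # returns the page body, or None on a 404; other HTTP errors are re-raised
    try:
        return urllib.request.urlopen(url).read()
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return None
        raise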
Do this:
import urllib.error  # import

except urllib.error.URLError as e:  # use 'urllib.error.URLError' and not 'urllib.HTTPError'
    print('Error code: ', e.code)  # or whatever you want
    return e.code
(urllib.error.HTTPError is a subclass of URLError, so this also catches 404 responses; note that e.code is only set when the caught exception is actually an HTTPError.)
