My web crawler doesn't work with BeautifulSoup - python

I am trying to make a web crawler using Python. I am borrowing this code from Programming Collective intelligence book by Toby Segaran. Since the code from the book was outdated, I made some necessary changes but still the program doesn't execute as expected. Here is my code:
import urllib
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import bs4
# Create a list of words to ignore
ignorewords=set(['the','of','to','and','a','in','is','it'])
class crawler:
# Initialize the crawler with the name of database
def __init__(self,dbname):
pass
def __del__(self): pass
def dbcommit(self):
pass
# Auxilliary function for getting an entry id and adding
# it if it's not present
def getentryid(self,table,field,value,createnew=True):
return None
# Index an individual page
def addtoindex(self,url,soup):
print('Indexing %s' % url)
# Extract the text from an HTML page (no tags)
def gettextonly(self,soup):
return None
# Separate the words by any non-whitespace character
def separatewords(self,text):
return None
# Return true if this url is already indexed
def isindexed(self,url):
return False
# Add a link between two pages
def addlinkref(self,urlFrom,urlTo,linkText):
pass
# Starting with a list of pages, do a breadth
# first search to the given depth, indexing pages
# as we go
def crawl(self,pages,depth=2):
pass
# Create the database tables
def createindextables(self):
pass
def crawl(self,pages,depth=2):
for i in range(depth):
newpages=set( )
for page in pages:
try:
c=request.urlopen(page)
except:
print("Could not open %s" % page)
continue
soup=BeautifulSoup(c.read())
self.addtoindex(page,soup)
links=soup('a')
for link in links:
if ('href' in dict(link.attrs)):
url=urljoin(page,link['href'])
if url.find("'")!=-1: continue
url=url.split('#')[0] # remove location portion
if url[0:4]=='http' and not self.isindexed(url):
newpages.add(url)
linkText=self.gettextonly(link)
self.addlinkref(page,url,linkText)
self.dbcommit( )
pages=newpages
pagelist=['http://google.com']
#pagelist=['file:///C:/Users/admin/Desktop/abcd.html']
crawler=crawler('')
crawler.crawl(pagelist)
the only output I get is
"Indexing http://google.com"
"Indexing http://google.com"
press any key to continue...
Everytime I put another link in page list I get same output as "Indexing xyz" where xyz is every link I put in pagelist. I also tried making a HTML file with lots of <a> tags but it didn't work too.

The problem is in your line link=soup('a'). If you want to find elements of class 'a', you should use the different methods named 'find_element_by...' (cf bs4 documentation)

Related

How to use BeautifulSoup to find specific class elements on a web page

Goal: To perform a web search that looks up a business and from the results, looks for either a "Permanently Closed" text or "Open" with hours or basically anything BUT "Permanently closed."
Problem: I'm using BeautifulSoup to parse the search results, but it only seems to find the correct element by class 50% of the time.
import urllib as u
from bs4 import BeautifulSoup as bs
import time
from PIL import Image
from io import BytesIO, StringIO
comp = pandas.DataFrame(data=[['ALL CITY FITNESS 2', '1005 E PESCADERO AVE SITE 211', 'TRACY', 'CA', '', '']],
columns=['NAME','ADDRESS','CITY','STATE','VERIFIED','STATUS'])
for i in comp.index:
if comp.loc[i, 'VERIFIED'] != 'YES':
location, address, city, state = comp.loc[i, ['NAME', 'ADDRESS', 'CITY', 'STATE']]
print(location, address, city, state)
search_string = f'{location} {address} {city}, {state}'
# search_html = Str(search_string).htmlconvert() # This is a custom function
search_html = 'ALL%20CITY%20FITNESS%202%201005%20E%20PESCADERO%20AVE%20SITE%20211%20TRACY%2C%20CA'
url = f'https://www.bing.com/search?q={search_html}'
try:
req = u.request.urlopen(url)
soup = bs(req, "xml")
# This checks if there is a Permanently Closed indicator on the page
# This works pretty consistently
for item in soup.find_all(class_='b_alert'):
print(item.text)
# Mark Location as closed
comp.loc[i, 'STATUS'] = 'INACTIVE'
else:
# This however, and the one below it rarely work
for check in soup.find_all(class_='e_green b_positive'):
print(check.text)
for check in soup.find_all('span', class_='e_green b_positive'):
print(check.text)
comp.loc[i, 'VERIFIED'] = 'YES'
time.sleep(3)
except Exception as e:
errors.append([i, search_string, e])
print(comp)
I performed this search manually and inspected the element, which is where I retrieved this class name. I've tried adding the '.' so that it was 'e_green.b_positive' and also removed it, as shown above. Neither seem to work, or at least don't work 100% of the time. What do I have wrong with my syntax where this is getting missed?
I'm not sure why this affects it but it actually has to do with how you're encoding your html, or rather the end format of your html that you're using to run the search.
Add '&qs=n&form=QBRE&=%25eManage%20Your%20Search%20History%25E&sp=-1&p' to the end of your url variable, and I bet your code will find those class items now.

unable to save data in datastore but no errors

I'm building a web crawler. some of the the data I input into datastore get saved, others do not get saved and I have no idea what is the problem.
here is my crawler class
class Crawler(object):
def get_page(self, url):
try:
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) # yessss!!! with the header, I am able to download pages
#response = urlfetch.fetch(url, method='GET')
#return response.content
#except urlfetch.InvalidURLError as iu:
# return iu.message
response = urllib2.urlopen(req)
return response.read()
except urllib2.HTTPError as e:
return e.reason
def get_all_links(self, page):
return re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',page)
def union(self, lyst1, lyst2):
try:
for elmt in lyst2:
if elmt not in lyst1:
lyst1.append(elmt)
return lyst1
except e:
return e.reason
#function that crawls the web for links starting from the seed
#returns a dictionary of index and graph
def crawl_web(self, seed="http://tonaton.com/"):
query = Listings.query() #create a listings object from storage
if query.get():
objListing = query.get()
else:
objListing = Listings()
objListing.toCrawl = [seed]
objListing.Crawled = []
start_time = datetime.datetime.now()
while datetime.datetime.now()-start_time < datetime.timedelta(0,5):#tocrawl (to crawl can take forever)
try:
#while True:
page = objListing.toCrawl.pop()
if page not in objListing.Crawled:
content = self.get_page(page)
add_page_to_index(page, content)
outlinks = self.get_all_links(content)
graph = Graph() #create a graph object with the url
graph.url = page
graph.links = outlinks #save all outlinks as the value part of the graph url
graph.put()
self.union(objListing.toCrawl, outlinks)
objListing.Crawled.append(page)
except:
return False
objListing.put() #save to database
return True #return true if it works
the classes that define the various ndb Models are in this python module:
import os
import urllib
from google.appengine.ext import ndb
import webapp2
class Listings(ndb.Model):
toCrawl = ndb.StringProperty(repeated=True)
Crawled = ndb.StringProperty(repeated=True)
#let's see how this works
class Index(ndb.Model):
keyword = ndb.StringProperty() # keyword part of the index
url = ndb.StringProperty(repeated=True) # value part of the index
#class Links(ndb.Model):
# links = ndb.JsonProperty(indexed=True)
class Graph(ndb.Model):
url = ndb.StringProperty()
links = ndb.StringProperty(repeated=True)
it used to work fine when I had JsonProperty in place of StringProperty(repeated=true). but JsonProperty is limited to 1500 bytes so I had an error once.
now, when I run the crawl_web member function, it actually crawls but when I check datastore it's only the Index entity that is created. No Graph, no Listing. please help. thanks.
Putting your code together, adding the missing imports, and logging the exception, eventually shows the first killer problem:
Exception Indexed value links must be at most 500 characters
and indeed, adding a logging of outlinks, one easily eyeballs that several of them are far longer than 500 characters -- therefore they can't be items in an indexed property, such as a StringProperty. Changing each repeated StringProperty to a repeated TextProperty (so it does not get indexed and thus has no 500-characters-per-item limitation), the code runs for a while (making a few instances of Graph) but eventually dies with:
An error occured while connecting to the server: Unable to fetch URL: https://sb':'http://b')+'.scorecardresearch.com/beacon.js';document.getElementsByTagName('head')[0].appendChild(s); Error: [Errno 8] nodename nor servname provided, or not known
and indeed, it's pretty obvious tht the alleged "link" is actually a bunch of Javascript and as such cannot be fetched.
So, essentially, the core bug in your code is not at all related to app engine, but rather, the issue is that your regular expression:
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
does not properly extract outgoing links given a web page containing Javascript as well as HTML.
There are many issues with your code, but to this point they're just slowing it down or making it harder to understand, not killing it -- what's killing it is using that regular expression pattern to try and extract links from the page.
Check out retrieve links from web page using python and BeautifulSoup -- most answers suggest, for the purpose of extracting links from a page, using BeautifulSoup, which may perhaps be a problem in app engine, but one shows how to do it with just Python and REs.

Getting the title using urllib

I am supposed to write a code that goes into a web site and gets its title so here is the code i have
import urllib.request
def findTitle(url):
urllib.request.Request(url)
#open url
urllib.request.urlopen(url)
urllib.request.urlopen(url).read().decode('utf-8')
#set same variable equal to the end of <title> tag
endTitlePos = url.find("<title>")
#set variable equal to starting position of <title> tag
startTitlePos = url.find("<title>", endTitlePos)
startTitlePos += len("<title>")
#set new variable equal to </title>
TitleContent=url.find("</title>",startTitlePos)
#return slice of output between the two variables
title = url[startTitlePos:endTitlePos]
content_list=[]
content_list.append(title)
return content_list
def main():
url="https://google.com/search"
print(findTitle(url))
main()
we are using google for an example. Now its supposed to just print "google" but currently it prints "['//google.com/searc']" i am just curious what i am missing here, i mean it seems very simple but i dont know why its printing the url rather then the title and how also do i turn it form the list into a string?
There are several alternative to get data from webpages. The best use BeautifulSoup. In your case string split() method works well
import urllib.request
def findTitle(url):
webpage = urllib.request.urlopen(url).read()
title = str(webpage).split('<title>')[1].split('</title>')[0]
return title
>>>print(findTitle('http://www.google.com'))
Google

python fetch data from website sub pages

I am attempting to create a bot that fetches market links from steam but have run into a problem. I was able to return all the data from a single page, but when I attempt to get multiple pages it just gives me copies of the first page though I give it working links (eg: http://steamcommunity.com/market/search?q=appid%3A753#p1 and then http://steamcommunity.com/market/search?q=appid%3A753#p2). I have tested the links and they work in my browser. This is my code.
import urllib2
import random
import time
start_url = "http://steamcommunity.com/market/search?q=appid%3A753"
end_page = 3
urls = []
def get_raw(url):
req = urllib2.Request(url)
response = urllib2.urlopen(req)
return response.read()
def get_market_urls(html):
index = 0
while index != -1:
index = html.find("market_listing_row_link", index+25)
beg = html.find("http", index)
end = html.find('"',beg)
print html[beg:end]
urls.append(html[beg:end])
def go_to_page(page):
return start_url+"#p"+str(page)
def wait(min, max):
wait_t = random.randint(min,max)
time.sleep(wait_t)
for i in range(end_page):
url = go_to_page(i+1)
raw = get_raw(url)
get_market_urls(raw)
Your problem is that you've misunderstood what the URL says.
The number after the hashtag doesn't mean it's a different URL that can be fetched. This is called the query string. In that particular page the query string explains to the javascript which page to pull off AJAX. (Read about it Here and Here if you're interested..).
Anyway, you shoul look at the url: http://steamcommunity.com/market/search/render/?query=appid%3A753&start=00&count=10. You can play with the start=00&count=10 parameters to get the results you want.
Enjoy.

Extracting parts of a webpage with python

So I have a data retrieval/entry project and I want to extract a certain part of a webpage and store it in a text file. I have a text file of urls and the program is supposed to extract the same part of the page for each url.
Specifically, the program copies the legal statute following "Legal Authority:" on pages such as this. As you can see, there is only one statute listed. However, some of the urls also look like this, meaning that there are multiple separated statutes.
My code works for pages of the first kind:
from sys import argv
from urllib2 import urlopen
script, urlfile, legalfile = argv
input = open(urlfile, "r")
output = open(legalfile, "w")
def get_legal(page):
# this is where Legal Authority: starts in the code
start_link = page.find('Legal Authority:')
start_legal = page.find('">', start_link+1)
end_link = page.find('<', start_legal+1)
legal = page[start_legal+2: end_link]
return legal
for line in input:
pg = urlopen(line).read()
statute = get_legal(pg)
output.write(get_legal(pg))
Giving me the desired statute name in the "legalfile" output .txt. However, it cannot copy multiple statute names. I've tried something like this:
def get_legal(page):
# this is where Legal Authority: starts in the code
end_link = ""
legal = ""
start_link = page.find('Legal Authority:')
while (end_link != '</a> '):
start_legal = page.find('">', start_link+1)
end_link = page.find('<', start_legal+1)
end2 = page.find('</a> ', end_link+1)
legal += page[start_legal+2: end_link]
if
break
return legal
Since every list of statutes ends with '</a> ' (inspect the source of either of the two links) I thought I could use that fact (having it as the end of the index) to loop through and collect all the statutes in one string. Any ideas?
I would suggest using BeautifulSoup to parse and search your html. This will be much easier than doing basic string searches.
Here's a sample that pulls all the <a> tags found within the <td> tag that contains the <b>Legal Authority:</b> tag. (Note that I'm using requests library to fetch page content here - this is just a recommended and very easy to use alternative to urlopen.)
import requests
from BeautifulSoup import BeautifulSoup
# fetch the content of the page with requests library
url = "http://www.reginfo.gov/public/do/eAgendaViewRule?pubId=200210&RIN=1205-AB16"
response = requests.get(url)
# parse the html
html = BeautifulSoup(response.content)
# find all the <a> tags
a_tags = html.findAll('a', attrs={'class': 'pageSubNavTxt'})
def fetch_parent_tag(tags):
# fetch the parent <td> tag of the first <a> tag
# whose "previous sibling" is the <b>Legal Authority:</b> tag.
for tag in tags:
sibling = tag.findPreviousSibling()
if not sibling:
continue
if sibling.getText() == 'Legal Authority:':
return tag.findParent()
# now, just find all the child <a> tags of the parent.
# i.e. finding the parent of one child, find all the children
parent_tag = fetch_parent_tag(a_tags)
tags_you_want = parent_tag.findAll('a')
for tag in tags_you_want:
print 'statute: ' + tag.getText()
If this isn't exactly what you needed to do, BeautifulSoup is still the tool you likely want to use for sifting through html.
They provide XML data over there, see my comment. If you think you can't download that many files (or the other end could dislike so many HTTP GET requests), I'd recommend asking their admins if they would kindly provide you with a different way of accessing the data.
I have done so twice in the past (with scientific databases). In one instance the sheer size of the dataset prohibited a download; they ran a SQL query of mine and e-mailed the results (but had previously offered to mail a DVD or hard disk). In another case, I could have done some million HTTP requests to a webservice (and they were ok) each fetching about 1k bytes. This would have taken long, and would have been quite inconvenient (requiring some error-handling, since some of these requests would always time out) (and non-atomic due to paging). I was mailed a DVD.
I'd imagine that the Office of Management and Budget could possibly be similar accomodating.

Categories

Resources