I'm building a web crawler. Some of the data I put into the datastore gets saved, but other data does not, and I have no idea what the problem is.
Here is my crawler class:
class Crawler(object):
    def get_page(self, url):
        try:
            req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) # yessss!!! with the header, I am able to download pages
            #response = urlfetch.fetch(url, method='GET')
            #return response.content
            #except urlfetch.InvalidURLError as iu:
            #    return iu.message
            response = urllib2.urlopen(req)
            return response.read()
        except urllib2.HTTPError as e:
            return e.reason

    def get_all_links(self, page):
        return re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', page)

    def union(self, lyst1, lyst2):
        try:
            for elmt in lyst2:
                if elmt not in lyst1:
                    lyst1.append(elmt)
            return lyst1
        except e:
            return e.reason

    #function that crawls the web for links starting from the seed
    #returns a dictionary of index and graph
    def crawl_web(self, seed="http://tonaton.com/"):
        query = Listings.query() #create a listings object from storage
        if query.get():
            objListing = query.get()
        else:
            objListing = Listings()
            objListing.toCrawl = [seed]
            objListing.Crawled = []
        start_time = datetime.datetime.now()
        while datetime.datetime.now() - start_time < datetime.timedelta(0, 5): #tocrawl (to crawl can take forever)
            try:
                #while True:
                page = objListing.toCrawl.pop()
                if page not in objListing.Crawled:
                    content = self.get_page(page)
                    add_page_to_index(page, content)
                    outlinks = self.get_all_links(content)
                    graph = Graph() #create a graph object with the url
                    graph.url = page
                    graph.links = outlinks #save all outlinks as the value part of the graph url
                    graph.put()
                    self.union(objListing.toCrawl, outlinks)
                    objListing.Crawled.append(page)
            except:
                return False
        objListing.put() #save to database
        return True #return true if it works
The classes that define the various ndb models are in this Python module:
import os
import urllib
from google.appengine.ext import ndb
import webapp2

class Listings(ndb.Model):
    toCrawl = ndb.StringProperty(repeated=True)
    Crawled = ndb.StringProperty(repeated=True)

#let's see how this works
class Index(ndb.Model):
    keyword = ndb.StringProperty()          # keyword part of the index
    url = ndb.StringProperty(repeated=True) # value part of the index

#class Links(ndb.Model):
#    links = ndb.JsonProperty(indexed=True)

class Graph(ndb.Model):
    url = ndb.StringProperty()
    links = ndb.StringProperty(repeated=True)
It used to work fine when I had JsonProperty in place of StringProperty(repeated=True), but JsonProperty is limited to 1500 bytes, so at some point I hit an error.
Now, when I run the crawl_web member function, it actually crawls, but when I check the datastore only the Index entity is created -- no Graph, no Listings. Please help. Thanks.
Putting your code together, adding the missing imports, and logging the exception, eventually shows the first killer problem:
Exception Indexed value links must be at most 500 characters
and indeed, adding logging of outlinks, one easily eyeballs that several of them are far longer than 500 characters -- therefore they can't be items in an indexed property such as a StringProperty. After changing each repeated StringProperty to a repeated TextProperty (which is not indexed and thus has no 500-characters-per-item limitation), the code runs for a while (creating a few Graph instances) but eventually dies with:
An error occured while connecting to the server: Unable to fetch URL: https://sb':'http://b')+'.scorecardresearch.com/beacon.js';document.getElementsByTagName('head')[0].appendChild(s); Error: [Errno 8] nodename nor servname provided, or not known
and indeed, it's pretty obvious that the alleged "link" is actually a bunch of JavaScript and as such cannot be fetched.
So, essentially, the core bug in your code is not related to App Engine at all; rather, the issue is that your regular expression:
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
does not properly extract outgoing links from a web page that contains JavaScript as well as HTML.
There are many issues with your code, but up to this point they're just slowing it down or making it harder to understand, not killing it -- what's killing it is using that regular expression pattern to try to extract links from the page.
Check out retrieve links from web page using python and BeautifulSoup -- most answers there suggest using BeautifulSoup to extract links from a page, which may be a problem on App Engine, but one shows how to do it with just Python and REs.
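Concretely, here is a minimal sketch of both fixes under the assumptions above -- the repeated properties switched to TextProperty, and links pulled from real <a> tags with the standard-library HTMLParser instead of the regular expression. The LinkExtractor helper and the standalone get_all_links are illustrative names of mine, not part of any library:

from HTMLParser import HTMLParser   # Python 2 stdlib, available on App Engine
from google.appengine.ext import ndb

class Listings(ndb.Model):
    # TextProperty is not indexed, so there is no 500-character-per-item limit
    toCrawl = ndb.TextProperty(repeated=True)
    Crawled = ndb.TextProperty(repeated=True)

class Graph(ndb.Model):
    url = ndb.StringProperty()
    links = ndb.TextProperty(repeated=True)

class LinkExtractor(HTMLParser):
    # Collects href values of <a> tags instead of regexing the raw page,
    # so inline JavaScript never ends up in the link list.
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.startswith('http'):
                    self.links.append(value)

def get_all_links(page):
    parser = LinkExtractor()
    parser.feed(page)
    return parser.links

A function like get_all_links can then replace the regex-based Crawler.get_all_links, so only hrefs of actual anchors reach toCrawl and Graph.links.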
Related
I am trying to make a web crawler in Python. I am borrowing this code from the book Programming Collective Intelligence by Toby Segaran. Since the code from the book is outdated, I made some necessary changes, but the program still doesn't execute as expected. Here is my code:
import urllib
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import bs4

# Create a list of words to ignore
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        pass

    def __del__(self): pass

    def dbcommit(self):
        pass

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        return None

    # Index an individual page
    def addtoindex(self, url, soup):
        print('Indexing %s' % url)

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        return None

    # Separate the words by any non-whitespace character
    def separatewords(self, text):
        return None

    # Return true if this url is already indexed
    def isindexed(self, url):
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self, pages, depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = request.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)
                links = soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1: continue
                        url = url.split('#')[0]  # remove location portion
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages

pagelist = ['http://google.com']
#pagelist=['file:///C:/Users/admin/Desktop/abcd.html']
crawler = crawler('')
crawler.crawl(pagelist)
The only output I get is:
"Indexing http://google.com"
"Indexing http://google.com"
press any key to continue...
Every time I put another link in pagelist I get the same kind of output, "Indexing xyz", where xyz is each link I put in pagelist. I also tried making an HTML file with lots of <a> tags, but that didn't work either.
The problem is in your line links=soup('a'). If you want to find all the <a> elements, use the dedicated find methods, e.g. soup.find_all('a') (cf. the bs4 documentation).
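For example, here is a minimal standalone sketch of that link-extraction step using find_all; the extract_links helper is my name for it, not something from the book:

from urllib import request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_links(page_url):
    # Fetch the page and pull href attributes from explicit <a> tags.
    html = request.urlopen(page_url).read()
    soup = BeautifulSoup(html, 'html.parser')  # naming a parser avoids the bs4 warning
    links = []
    for link in soup.find_all('a', href=True):
        url = urljoin(page_url, link['href']).split('#')[0]  # drop the fragment
        if url.startswith('http'):
            links.append(url)
    return links

print(extract_links('http://google.com'))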
import requests
MSA_request=""">G1
MGCTLSAEDKAAVERSKMIDRNLREDGEKAAREVKLLLL
>G2
MGCTVSAEDKAAAERSKMIDKNLREDGEKAAREVKLLLL
>G3
MGCTLSAEERAALERSKAIEKNLKEDGISAAKDVKLLLL"""
q={"stype":"protein","sequence":MSA_request,"outfmt":"clustal"}
r=requests.post("http://www.ebi.ac.uk/Tools/msa/clustalo/",data=q)
This is my script. I send this request to the website, but the result looks as if I did nothing; the web service didn't receive my request. This method used to work fine with other websites. Maybe this page has a pop-up window asking for cookie agreement?
The form on the page you are referring to has a separate URL, namely
http://www.ebi.ac.uk/Tools/services/web_clustalo/toolform.ebi
You can verify this with a DOM inspector in your browser.
So in order to proceed with requests, you need to access the right page
r=requests.post("http://www.ebi.ac.uk/Tools/services/web_clustalo/toolform.ebi",data=q)
This will submit a job with your input data; it doesn't return the result directly. To check the results, you need to extract the job ID from the previous response and then issue another request (with no data) to
http://www.ebi.ac.uk/Tools/services/web_clustalo/toolresult.ebi?jobId=...
However, you should definitely check whether this programmatic access is compatible with the TOS of that website...
Here is an example:
from lxml import html
import requests
import sys
import time
MSA_request=""">G1
MGCTLSAEDKAAVERSKMIDRNLREDGEKAAREVKLLLL
>G2
MGCTVSAEDKAAAERSKMIDKNLREDGEKAAREVKLLLL
>G3
MGCTLSAEERAALERSKAIEKNLKEDGISAAKDVKLLLL"""
q={"stype":"protein","sequence":MSA_request,"outfmt":"clustal"}
r = requests.post("http://www.ebi.ac.uk/Tools/services/web_clustalo/toolform.ebi",data = q)
tree = html.fromstring(r.text)
title = tree.xpath('//title/text()')[0]
#check the status and get the job id
status, job_id = map(lambda s: s.strip(), title.split(':', 1))
if status != "Job running":
sys.exit(1)
#it might take some time for the job to finish
time.sleep(10)
#download the results
r = requests.get("http://www.ebi.ac.uk/Tools/services/web_clustalo/toolresult.ebi?jobId=%s" % (job_id))
#prints the full response
#print(r.text)
#isolate the alignment block
tree = html.fromstring(r.text)
alignment = tree.xpath('//pre[@id="alignmentContent"]/text()')[0]
print(alignment)
I usually write function-only Python programs, but I have decided on an OOD approach (my first) for my current program, a web scraper:
import csv
import urllib2

NO_VACANCIES = ['no vacancies', 'not hiring']

class Page(object):
    def __init__(self, url):
        self.url = url

    def get_source(self):
        self.source = urllib2.urlopen(url).read()
        return self.source

class HubPage(Page):
    def has_vacancies(self):
        return not(any(text for text in NO_VACANCIES if text in self.source.lower()))

urls = []
with open('25.csv', 'rb') as spreadsheet:
    reader = csv.reader(spreadsheet)
    for row in reader:
        urls.append(row[0].strip())

for url in urls:
    page = HubPage(url)
    source = page.get_source()
    if page.has_vacancies():
        print 'Has vacancies'
Some context: HubPage represents a typical 'jobs' page on a company's website. I am subclassing Page because I will eventually subclass it again for individual job pages, and some methods will be used only to extract data from individual job pages (this may be overkill).
Here's my issue: I know from experience that urllib2, while it has its critics, is fast -- very fast -- at doing what it does, namely fetching a page's source. Yet I notice that in my design, processing each URL takes a few orders of magnitude longer than what I typically observe.
Is it the fact that class instantiations are involved (unnecessarily, perhaps)?
Might the fact that HubPage is inherited be the cause?
Is the call to any() known to be expensive when it contains a generator expression, as it does here?
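One way to answer these questions empirically is to time the fetch and the substring scan separately; here is a minimal sketch (timed_check and the example URL are mine, purely for illustration):

import time
import urllib2

NO_VACANCIES = ['no vacancies', 'not hiring']

def timed_check(url):
    # Time the network fetch and the text scan separately to see which dominates.
    t0 = time.time()
    source = urllib2.urlopen(url).read()
    t1 = time.time()
    has_vacancies = not any(text in source.lower() for text in NO_VACANCIES)
    t2 = time.time()
    print 'fetch: %.3fs  scan: %.3fs' % (t1 - t0, t2 - t1)
    return has_vacancies

timed_check('http://example.com')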
I am trying to get the names of the members of a group I belong to. I am able to get the names on the first page, but I am not sure how to go to the next page.
My code:
import json
import urllib2

url = 'https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>'
json_obj = urllib2.urlopen(url)
data = json.load(json_obj)
for each in data['data']:
    print each['name']
Using the code above I successfully get all the names on the first page, but the question is: how do I go to the next page?
In the Graph API Explorer Output screen I see this:
What change does my code need to keep going to next pages and get names of ALL members of the group?
The JSON returned by the Graph API is telling you where to get the next page of data, in data['paging']['next']. You could give something like this a try:
import json
import urllib2

def printNames():
    json_obj = urllib2.urlopen(url)
    data = json.load(json_obj)
    for each in data['data']:
        print each['name']
    return data['paging']['next'] # Return the URL to the next page of data

url = 'https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>'
url = printNames()
print "====END OF PAGE 1===="
url = printNames()
print "====END OF PAGE 2===="
You would need to add checks; for instance, ['paging']['next'] will only be present in your JSON object if there is a next page, so you might want to modify your function to return a more complex structure to convey this information. But this should give you the idea.
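For instance, a minimal sketch of such a check, looping until no 'next' URL is present (print_all_names is just an illustrative name; the setup is the same urllib2/json as above):

import json
import urllib2

def print_all_names(start_url):
    url = start_url
    while url:
        data = json.load(urllib2.urlopen(url))
        for each in data['data']:
            print each['name']
        # 'next' only appears while there are more pages to fetch
        url = data.get('paging', {}).get('next')

print_all_names('https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>')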
I am attempting to create a bot that fetches market links from Steam, but I have run into a problem. I was able to return all the data from a single page, but when I attempt to get multiple pages it just gives me copies of the first page, even though I give it working links (e.g. http://steamcommunity.com/market/search?q=appid%3A753#p1 and then http://steamcommunity.com/market/search?q=appid%3A753#p2). I have tested the links and they work in my browser. This is my code:
import urllib2
import random
import time

start_url = "http://steamcommunity.com/market/search?q=appid%3A753"
end_page = 3
urls = []

def get_raw(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    return response.read()

def get_market_urls(html):
    index = 0
    while index != -1:
        index = html.find("market_listing_row_link", index+25)
        beg = html.find("http", index)
        end = html.find('"', beg)
        print html[beg:end]
        urls.append(html[beg:end])

def go_to_page(page):
    return start_url+"#p"+str(page)

def wait(min, max):
    wait_t = random.randint(min, max)
    time.sleep(wait_t)

for i in range(end_page):
    url = go_to_page(i+1)
    raw = get_raw(url)
    get_market_urls(raw)
Your problem is that you've misunderstood what the URL says.
The number after the hash sign isn't part of a different URL that can be fetched; it's the fragment identifier, which is never sent to the server. On that particular page the fragment tells the JavaScript which page of results to pull over AJAX. (Read about it Here and Here if you're interested.)
Anyway, you should look at this URL instead: http://steamcommunity.com/market/search/render/?query=appid%3A753&start=00&count=10. You can play with the start=00&count=10 parameters to get the results you want.
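For example, a minimal sketch that pages through the render endpoint, reusing get_market_urls from your code; it assumes the endpoint returns JSON with the listing markup under a key such as "results_html", which you should verify against an actual response:

import json
import urllib2

render_url = "http://steamcommunity.com/market/search/render/?query=appid%3A753&start=%d&count=%d"

def get_page_html(start, count=10):
    # The render endpoint returns JSON rather than a full HTML page;
    # the listing markup is assumed to live under "results_html".
    raw = urllib2.urlopen(render_url % (start, count)).read()
    data = json.loads(raw)
    return data.get("results_html", "")

for page in range(3):
    html = get_page_html(page * 10)
    get_market_urls(html)  # the parser from the code in the question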
Enjoy.