2 functions in scrapy spider and the second one not running - python

I am using scrapy to get the content inside some urls on a page, similar to this question here:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the subURLs from my start URLs (first def). However, my second def doesn't seem to be running, and the result file is empty. I have tested the content inside the function in the Scrapy shell and it gets the info I want, but not when I run the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin

logger = logging.getLogger(__name__)

class WheelsonlinespiderSpider(scrapy.Spider):
    logger.info('Spider starting')
    name = 'wheelsonlinespider'
    rotate_user_agent = True # lives in middleware.py and settings.py
    allowed_domains = ["https://wheelsonline.ca"]
    start_urls = urls # this list is created in url_list.py
    logger.info('URLs retrieved')

    def parse(self, response):
        subURLs = []
        partialURLs = response.css('.directory_name::attr(href)').extract()
        for i in partialURLs:
            subURLs = urljoin('https://wheelsonline.ca/', i)
            yield scrapy.Request(subURLs, callback=self.parse_dealers)
            logger.info('Dealer ' + subURLs + ' fetched')

    def parse_dealers(self, response):
        logger.info('Beginning of page')
        dlr = Dealer()

        #Extracting the content using css selectors
        try:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
        except TypeError:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
        dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
        dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()

        logger.info('Dealer fetched ' + dlr['DealerName'])
        yield dlr
        logger.info('End of page')

Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains
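For reference, a minimal sketch of how the top of the spider would look with that fix applied; everything else stays exactly as posted in the question:

import scrapy
from WheelsOnlineScrapper.url_list import urls

class WheelsonlinespiderSpider(scrapy.Spider):
    name = 'wheelsonlinespider'
    allowed_domains = ["wheelsonline.ca"]  # domain only, no scheme, so the offsite filter lets the sub-URL requests through
    start_urls = urls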

Related

Running Scrapy crawler from a main function

I have written a crawler in scrapy but I would want to initiate the crawling by using a main method.
import sys, getopt
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re

class TutsplusItem(scrapy.Item):
    title = scrapy.Field()

class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def __init__(self, *args):
        try:
            opts, args = getopt.getopt(args, "hi:o:", ["ifile=", "ofile="])
        except getopt.GetoptError:
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit(2)
        super(MySpider, self).__init__(self, *args)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We stored already crawled links in this list
        crawledLinks = []
        # Pattern to check proper link
        # I only want to get the tutorial posts
        # linkPattern = re.compile("^\/tutorials\?page=\d+")
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            #if linkPattern.match(link) and not link in crawledLinks:
            if not link in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)
        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        count = 0
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Instead of using scrapy runspider Crawler.py arg1 arg2,
I would like to have a separate class with a main function and initiate scrapy from there. How do I do this?
There are different ways to approach this, but I suggest the following:
Have a main.py file in the same directory that will open a new process and launch the spider with the parameters you need.
The main.py file would have something like the following:
import subprocess
scrapy_command = 'scrapy runspider {spider_name} -a param_1="{param_1}"'.format(spider_name='your_spider', param_1='your_value')
process = subprocess.Popen(scrapy_command, shell=True)
With this code, you just need to call your main file.
python main.py
Hope it helps.
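If you would rather stay inside a single Python process instead of shelling out, a rough alternative sketch uses CrawlerProcess (this assumes main.py can import the spider class; the module name Crawler and the argument values below are only illustrative):

from scrapy.crawler import CrawlerProcess
from Crawler import MySpider  # hypothetical module name, taken from the command in the question

if __name__ == '__main__':
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    # extra positional arguments are forwarded to MySpider.__init__, which parses them with getopt
    process.crawl(MySpider, '-i', 'input.txt', '-o', 'output.txt')
    process.start()  # blocks until the crawl finishes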

missing data using scrapy

I'm using scrapy to get the data of
http://www.bbb.org/greater-san-francisco/business-reviews/architects/klopf-architecture-in-san-francisco-ca-152805
So I created some items to save the information, but I don't get all the data every time I run the script; usually I get some empty items, so I need to run the script again until I get all the items.
This is the code of the spider
import scrapy
from tutorial.items import Product
from scrapy.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["bbb.org/"]
    start_urls = [
        "http://www.bbb.org/greater-san-francisco/business-reviews/architects/klopf-architecture-in-san-francisco-ca-152805"
        #"http://www.bbb.org/greater-san-francisco/business-reviews/architects/a-d-architects-in-oakland-ca-133229"
        #"http://www.bbb.org/greater-san-francisco/business-reviews/architects/aecom-in-concord-ca-541360"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        producto = Product()
        #producto['name'] = response.xpath('//*[@id="business-detail"]/div/h1')
        producto = Product(Name=response.xpath('//*[@id="business-detail"]/div/h1/text()').extract(),
                           Telephone=response.xpath('//*[@id="business-detail"]/div/p/span[1]/text()').extract(),
                           Address=response.xpath('//*[@id="business-detail"]/div/p/span[2]/span[1]/text()').extract(),
                           Description=response.xpath('//*[@id="business-description"]/p[2]/text()').extract(),
                           BBBAccreditation=response.xpath('//*[@id="business-accreditation-content"]/p[1]/text()').extract(),
                           Complaints=response.xpath('//*[@id="complaint-sort-container"]/text()').extract(),
                           Reviews=response.xpath('//*[@id="complaint-sort-container"]/p/text()').extract(),
                           WebPage=response.xpath('//*[@id="business-detail"]/div/p/span[3]/a/text()').extract(),
                           Rating=response.xpath('//*[@id="accedited-rating"]/img/text()').extract(),
                           ServiceArea=response.xpath('//*[@id="business-additional-info-text"]/span[4]/p/text()').extract(),
                           ReasonForRating=response.xpath('//*[@id="reason-rating-content"]/ul/li[1]/text()').extract(),
                           NumberofEmployees=response.xpath('//*[@id="business-additional-info-text"]/p[8]/text()').extract(),
                           LicenceNumber=response.xpath('//*[@id="business-additional-info-text"]/p[6]/text()').extract(),
                           Contact=response.xpath('//*[@id="business-additional-info-text"]/span[3]/span/span[1]/text()').extract(),
                           BBBFileOpened=response.xpath('//*[@id="business-additional-info-text"]/span[3]/span/span[1]/text()').extract(),
                           BusinessStarted=response.xpath('//*[@id="business-additional-info-text"]/span[3]/span/span[1]/text()').extract(),)
        #producto.add_xpath('name', '//*[@id="business-detail"]/div/h1')
        #product.add_value('name', 'today') # you can also use literal values
        #product.load_item()
        return producto
This page requires setting a user agent, so I have a file of user agents; could it be that some of them are wrong?
Yes, some of your user agents could be wrong (perhaps old or deprecated ones) and the site may be rejecting them. If there is no problem with using only one user agent, you could add it to settings.py:
USER_AGENT="someuseragent"
Remember to also remove or disable the random user-agent middleware in settings.py.
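As a minimal sketch of what that could look like in settings.py (the middleware path below is a placeholder for whatever rotating user-agent middleware the project currently enables, and the user agent string is just an example):

# settings.py
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # one known-good user agent

DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.RandomUserAgentMiddleware': None,  # placeholder path; None disables the middleware
}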

Scrapy spider results can't be pipelined into database when ran from a script [duplicate]

This question already has answers here:
Getting scrapy project settings when script is outside of root directory
(6 answers)
Closed 9 months ago.
I've written a Scrapy spider that I am trying to run from a python script located in another directory. The code I'm using from the docs seems to run the spider, but when I check the postgresql table, it hasn't been created. The spider only properly pipelines the scraped data if I use the scrapy crawl command. I've tried placing the script in the directory right above the scrapy project and also in the same directory as the config file, and neither seems to create the table.
The code for the script is below, followed by the code for the spider. I think the problem involves the directory in which the script should be placed and/or the code that I use within the spider file to enable the spider to be run from a script, but I'm not sure. Does it look like there is a problem with the function that is being called in the script, or is there something that needs to be changed within the settings file? I can provide the code for the pipelines file if necessary, thanks.
Script file (only 3 lines)
from ticket_city_scraper import *
from ticket_city_scraper.spiders import tc_spider
tc_spider.spiderCrawl()
Spider file
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from ticket_city_scraper.items import ComparatorItem
from urlparse import urljoin
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging

bandname = raw_input("Enter bandname\n")
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"

class MySpider3(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]
    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'

    def create_link(self, bandname):
        tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
        self.start_urls = [tc_url]
        #return tc_url

    tickets_list_xpath = './/div[@class = "vevent"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
        eventCity = ''.join(event_City)
        loader.add_value('eventCity', eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
        eventState = ''.join(event_State)
        loader.add_value('eventState', eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
        eventDate = ''.join(event_Date)
        loader.add_value('eventDate', eventDate)
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)

    def parse(self, response):
        """
        # """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation', './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink', './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime', '//div[@id="divEventDate"]/@title') #datetime type
            #loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            #sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)

def spiderCrawl():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(MySpider3)
    process.start()
It's because your settings object only contains a user agent. Your project settings are what determine which pipeline gets run. From the Scrapy docs:
You can automatically import your spiders passing their name to CrawlerProcess, and use get_project_settings to get a Settings instance with your project settings.
More info here: http://doc.scrapy.org/en/latest/topics/practices.html
Read more than the first example.
The settings are not being read if you run from a parent folder.
This answer helped me:
Getting scrapy project settings when script is outside of root directory
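In practice that usually means building the CrawlerProcess from get_project_settings() instead of a bare dict, roughly like this (it only works if Scrapy can locate the project's scrapy.cfg / settings module from where the script runs, which is what the linked answer addresses):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def spiderCrawl():
    # load the full project settings so ITEM_PIPELINES (and the postgres pipeline) are actually enabled
    process = CrawlerProcess(get_project_settings())
    process.crawl(MySpider3)
    process.start()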

Using Scrapy to crawl websites and only scrape pages containing keywords

I'm trying to crawl various websites looking for particular keywords of interest and only scraping those pages. I've written the script to run as a standalone Python script rather than the traditional Scrapy project structure (following this example) and using the CrawlSpider class. The idea is that from a given homepage the Spider will crawl pages within that domain and only scrape links from pages which contain the keyword. I'm also trying to save a copy of the page when I find one containing the keyword. The previous version of this question related to a syntax error (see comments below, thanks @tegancp for helping me clear that up) but now although my code runs I am still unable to crawl links only on pages of interest as intended.
I think I want to either i) remove the call to LinkExtractor in the __init__ function or ii) only call LinkExtractor from within __init__ but with a rule based on what I find when I visit that page rather than some attribute of the URL. I can't do i) because the CrawlSpider class wants a rule and I can't do ii) because LinkExtractor doesn't have a process_links option like the old SgmlLinkExtractor which appears to be deprecated. I'm new to Scrapy so wondering if my only option is to write my own LinkExtractor?
import sys

from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor

# define an item class
class GenItem(Item):
    url = Field()

# define a spider
class GenSpider(CrawlSpider):
    name = "genspider3"

    # requires 'start_url', 'allowed_domains' and 'folderpath' to be passed as string arguments IN THIS PARTICULAR ORDER!!!
    def __init__(self):
        self.start_urls = [sys.argv[1]]
        self.allowed_domains = [sys.argv[2]]
        self.folder = sys.argv[3]
        self.writefile1 = self.folder + 'hotlinks.txt'
        self.writefile2 = self.folder + 'pages.txt'
        self.rules = [Rule(LinkExtractor(allow_domains=(sys.argv[2],)), follow=True, callback='parse_links')]
        super(GenSpider, self).__init__()

    def parse_start_url(self, response):
        # get list of links on start_url page and process using parse_links
        list(self.parse_links(response))

    def parse_links(self, response):
        # if this page contains a word of interest save the HTML to file and crawl the links on this page
        theHTML = response.body
        if 'keyword' in theHTML:
            with open(self.writefile2, 'a+') as f2:
                f2.write(theHTML + '\n')
            with open(self.writefile1, 'a+') as f1:
                f1.write(response.url + '\n')
            for link in LinkExtractor(allow_domains=(sys.argv[2],)).extract_links(response):
                linkitem = GenItem()
                linkitem['url'] = link.url
                log.msg(link.url)
                with open(self.writefile1, 'a+') as f1:
                    f1.write(link.url + '\n')
                return linkitem

# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats() # collect/log stats?
    # stop the reactor
    reactor.stop()

# instantiate settings and provide a custom configuration
settings = Settings()
#settings.set('DEPTH_LIMIT', 2)
settings.set('DOWNLOAD_DELAY', 0.25)

# instantiate a crawler passing in settings
crawler = Crawler(settings)

# instantiate a spider
spider = GenSpider()

# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()

# start logging
log.start(loglevel=log.DEBUG)

# start the reactor (blocks execution)
reactor.run()

How to avoid duplication in a crawler

I wrote a crawler using the scrapy framework in python to select some links and meta tags. It then crawls the start urls and writes the data in a JSON encoded format onto a file. The problem is that when the crawler is run two or three times with the same start urls, the data in the file gets duplicated. To avoid this I used a downloader middleware in scrapy, which is this: http://snippets.scrapy.org/snippets/1/
What I did was copy and paste the above code in a file inside my scrapy project and I enabled it in the settings.py file by adding the following line:
SPIDER_MIDDLEWARES = {'a11ypi.removeDuplicates.IgnoreVisitedItems':560}
where "a11ypi.removeDuplicates.IgnoreVisitedItems" is the class path name and finally I went in and modified my items.py file and included the following fields
visit_id = Field()
visit_status = Field()
But this doesn't work; the crawler still produces the same result, appending it to the file when run twice.
I did the writing to the file in my pipelines.py file as follows:
import json

class AYpiPipeline(object):
    def __init__(self):
        self.file = open("a11ypi_dict.json", "ab+")

    # this method is called to process an item after it has been scraped.
    def process_item(self, item, spider):
        d = {}
        i = 0
        # Here we are iterating over the scraped items and creating a dictionary of dictionaries.
        try:
            while i < len(item["foruri"]):
                d.setdefault(item["foruri"][i], {}).setdefault(item["rec"][i], {})[item["foruri_id"][i]] = item['thisurl'] + ":" + item["thisid"][i]
                i += 1
        except IndexError:
            print "Index out of range"
        json.dump(d, self.file)
        return item
And my spider code is as follows:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from a11ypi.items import AYpiItem

class AYpiSpider(CrawlSpider):
    name = "a11y.in"
    allowed_domains = ["a11y.in"]

    # This is the list of seed URLs to begin crawling with.
    start_urls = ["http://www.a11y.in/a11ypi/idea/fire-hi.html"]

    # This is the callback method, which is used for scraping specific data
    def parse(self, response):
        temp = []
        hxs = HtmlXPathSelector(response)
        item = AYpiItem()
        wholeforuri = hxs.select("//@foruri").extract() # XPath to extract the foruri, which contains both the URL and id in foruri
        for i in wholeforuri:
            temp.append(i.rpartition(":"))
        item["foruri"] = [i[0] for i in temp] # This contains the URL in foruri
        item["foruri_id"] = [i.split(":")[-1] for i in wholeforuri] # This contains the id in foruri
        item['thisurl'] = response.url
        item["thisid"] = hxs.select("//@foruri/../@id").extract()
        item["rec"] = hxs.select("//@foruri/../@rec").extract()
        return item
Kindly suggest what to do.
Try to understand why the snippet is written as it is:
if isinstance(x, Request):
    if self.FILTER_VISITED in x.meta:
        visit_id = self._visited_id(x)
        if visit_id in visited_ids:
            log.msg("Ignoring already visited: %s" % x.url,
                    level=log.INFO, spider=spider)
            visited = True
Notice in line 2: you actually require a key in Request.meta called FILTER_VISITED in order for the middleware to drop the request. The reason is well-intended: otherwise every single URL you have visited would be skipped and you would have no URLs to traverse at all. So FILTER_VISITED actually allows you to choose which URL patterns you want to skip. If you want links extracted with a particular rule skipped, just do:
Rule(SgmlLinkExtractor(allow=('url_regex1', 'url_regex2')), callback='my_callback', process_request=setVisitFilter)

def setVisitFilter(request):
    request.meta['filter_visited'] = True
    return request
P.S. I do not know if it works for 0.14 and above, as some of the code has changed for storing spider context in the sqlite db.
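As a rough usage sketch, the same meta flag can also be set on requests you build yourself, so the middleware will consider them for filtering (the helper name below is only illustrative):

from scrapy.http import Request

def make_filtered_request(url, callback):
    # 'filter_visited' is the meta key the snippet's middleware checks (FILTER_VISITED);
    # requests without it are never dropped by that middleware
    return Request(url, callback=callback, meta={'filter_visited': True})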
