Scrapy not collecting emails properly - python

I'm using Scrapy to collect some data and everything works fine except the email extraction part. For some reason the email column in the .csv file is blank, or only a few emails are extracted. I've tried adjusting DOWNLOAD_DELAY and CLOSESPIDER_ITEMCOUNT but it's not working. Any help is much appreciated.
import re
import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]

    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/sdo/cto/" + item_id

            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])

            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

First of all, a quote from the craigslist Terms of Use as a warning:
USE. You agree not to use or provide software (except for general
purpose web browsers and email clients, or software expressly licensed
by us) or services that interact or interoperate with CL, e.g. for
downloading, uploading, posting, flagging, emailing, search, or mobile
use. Robots, spiders, scripts, scrapers, crawlers, etc. are
prohibited, as are misleading, unsolicited, unlawful, and/or spam
postings/email. You agree not to collect users' personal and/or
contact information ("PI").
Several things to fix here:
- the contact information is under reply/hnf/cto/ instead of reply/sdo/cto/
- specify the User-Agent and X-Requested-With headers
The complete code that works for me:
import re
from urlparse import urljoin

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]

    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = urljoin(self.BASE_URL, link)
            yield scrapy.Request(absolute_url,
                                 callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = urljoin(self.BASE_URL, "reply/hnf/cto/" + item_id)

            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])

            return scrapy.Request(url,
                                  meta={'item': item},
                                  callback=self.parse_contact,
                                  headers={"X-Requested-With": "XMLHttpRequest",
                                           "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"})

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
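Since the original symptom was an empty email column in the .csv, it is also worth ruling out the export step. A minimal sketch of a per-spider CSV export (FEED_FORMAT and FEED_URI are the standard feed settings of that Scrapy era; the command-line equivalent is scrapy crawl dmoz -o items.csv):

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    # hypothetical addition: write items (including the "attr" email field) to CSV
    custom_settings = {
        "FEED_FORMAT": "csv",
        "FEED_URI": "items.csv",
    }
    # ... the rest of the spider stays unchanged ...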

Related

Scrapy file, only running the initial start_urls instead of running through the whole list

As the title states, I am trying to run my Scrapy program; the issue I am running into is that it seems to only return the yielded items from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; in my code I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason when I run "scrapy crawl productJumperFix" I'm not getting any output from the terminal, and I'm not sure how to debug since I can't even see the output errors.
Try using the start_requests method. The underlying problem is that your start_urls list is missing commas between most of the entries, so Python concatenates the adjacent string literals into one long, invalid URL and only the first page is crawled; the minimal fix of just adding the commas is shown after the example below. Yielding the requests explicitly from start_requests also works.
For example:
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
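For completeness, the minimal fix referred to above is simply to separate the entries of the original start_urls with commas:

    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit',
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
        'https://www.antaira.com/products/Unmanaged-10-gigabit',
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
    ]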

scrapy.Request() callback not working

There are lots of questions about this already, but most people encounter this problem because of the dont_filter argument. I passed dont_filter=True, but my custom parse generators still didn't work. Here is my code (the third parser, parse_spec, is never called; parse_models_follow_next_page works fine when called by parse(), but it can't call itself when it needs to move to the next page):
import scrapy
from gsmarena.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }
    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        length = len(response.xpath(
            "//div[@class='makers']/self::div//a").extract())
        for i in range(1, length):
            url = "https://www.gsmarena.com/" + \
                response.xpath(
                    "(//div[@class='makers']/self::div//a)[{}]/@href".format(i)).extract()[0]
            model = response.xpath(
                "(//div[@class='makers']/self::div//a//span/text())[{}]".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_spec, meta={'brand': brand, 'model': model}, dont_filter=True)

        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        item['model'] = response.meta.get('model')
        item['brand'] = response.meta.get('brand')
        for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
            item[spec_name] = spec
        yield item
and sorry for my bad English.
Your scraper has a few issues.
allowed_domains = ["https://www.gsmarena.com/"]
should be
allowed_domains = ["www.gsmarena.com"]
Next, you don't have an errback_httpbin method defined in your class (note that an errback receives a Failure object, not a response):

def errback_httpbin(self, failure):
    pass
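If you do attach the errback to your requests, a minimal sketch of wiring it up might look like this (the method name errback_httpbin is just the one used above):

    # inside parse(): pass the errback alongside the callback
    yield scrapy.Request(url, callback=self.parse_models_follow_next_page,
                         errback=self.errback_httpbin,
                         meta={'brand': brand}, dont_filter=True)

    def errback_httpbin(self, failure):
        # log the failed request instead of silently dropping it
        self.logger.error(repr(failure))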
The code below
for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
should be
for spec_name, spec in zip(response.xpath('//table//td[1]/text()').extract(), response.xpath('//table//td[2]/text()').extract()):
This still has some issues, though.
Also, your code will take some time before the first yield, as the scheduler picks URLs based on the order in which they come in.
I have made some changes in the code and it scrapes all the results except spec_name, which is not specified in an understandable way.
import scrapy
from lxml import html
from tutorial.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }
    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        meta = response.meta
        doc = html.fromstring(response.body)
        single_obj = doc.xpath('.//div[@class="makers"]/ul//li')
        for obj in single_obj:
            url = self.allowed_domains[0] + obj.xpath('.//a/@href')[0]
            meta['brand'] = obj.xpath('.//a/@href')[0].split('_')[0]
            meta['model'] = obj.xpath('.//a/@href')[0]
            yield scrapy.Request(url=url, callback=self.parse_spec,
                                 meta=meta, dont_filter=True)

        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        meta = response.meta
        item['model'] = meta['model']
        item['brand'] = meta['brand']
        # Need to specify details about spec_name
        # for spec_name, spec in zip(response.xpath('//table//td[1]').extract(),
        #                            response.xpath('//table//td[2]').extract()):
        #     item[spec_name] = spec
        yield item
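One way to fill in the missing spec handling without declaring an item field per spec name is to collect the whole table into a single dict. This is only a sketch and assumes PhoneItems declares (or is given) a specs = scrapy.Field():

    def parse_spec(self, response):
        item = PhoneItems()
        item['model'] = response.meta.get('model')
        item['brand'] = response.meta.get('brand')

        # assumes: specs = scrapy.Field() exists on PhoneItems
        specs = {}
        names = response.xpath('//table//td[1]//text()').extract()
        values = response.xpath('//table//td[2]//text()').extract()
        for spec_name, spec in zip(names, values):
            specs[spec_name.strip()] = spec.strip()
        item['specs'] = specs
        yield item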

What are the best practices for calling an external API?

So let's say I want to write a spider that uses the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"


class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        item['fb_url'], item['shares'], item['comments'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this to work if, rather than using requests, I use a scrapy.Request call. Something like this:
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"


class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(base, callback=self.parse_likes)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        links = response.css('a::attr(href)').extract()
        item['url'] = response.url
        item['fb_data'] = self.get_likes(response.url).body
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think I'm missing some understanding about how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case: how to yield an item that is built from multiple URLs. The most common solution is to chain requests, carrying your item in the request.meta parameter. Unlike requests.get, constructing a scrapy.Request does not fetch anything by itself; the request is only scheduled, and its response arrives later in the callback, which is why reading .body on the Request object gives you nothing useful.
For your example, an implementation with this logic could look like:
import json

import scrapy


class WebSite(scrapy.Spider):
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # ask the Graph API about this link and finish the item in the callback
            api_url = self.base(link, self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item
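As a side note, on Scrapy 1.7+ the same chaining is usually written with cb_kwargs instead of meta, which keeps meta free for middleware use; a minimal variant of the request/callback pair above:

    # same chain as above, but passing the item through cb_kwargs (Scrapy 1.7+)
    yield scrapy.Request(api_url,
                         callback=self.parse_likes,
                         cb_kwargs={'item': item})

    def parse_likes(self, response, item):
        data = json.loads(response.text)
        item['share_count'] = (data['id'],
                               data['share']['comment_count'],
                               data['share']['share_count'])
        yield item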

Scraping URL and title from a nested anchor tag

This is my first scraper using Scrapy.
I am trying to scrape the video URL and title from https://www.google.co.in/trends/hotvideos#hvsm=0.
import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class CraigslistItem(Item):
    title = Field()
    link = Field()


class DmozSpider(scrapy.Spider):
    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        #for sel in response.xpath('//body/div'):
        hxs = HtmlXPathSelector(response)
        sites = hxs.xpath("//span[@class='single-video-image-container']")
        items = []

        for sel in response.xpath("//span[@class='single-video-image-container']"):
            item = CraigslistItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            items.append(item)
        print items
A general walkthrough of what I am doing wrong would be much appreciated.
Use Scrapy's FormRequest to get this done. The page builds its listing from a separate POST to the hotItems endpoint, so the video markup you are selecting is not present in the initial HTML; fetching that JSON response directly avoids the problem.
import json

import scrapy
from scrapy.http import FormRequest

# CraigslistItem as defined in the question above


class DmozSpider(scrapy.Spider):
    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        url = 'https://www.google.co.in/trends/hotvideos/hotItems'
        formdata = {'hvd': '', 'geo': 'IN', 'mob': '0', 'hvsm': '0'}
        yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)

    def parse_data(self, response):
        json_response = json.loads(response.body)
        videos = json_response.get('videoList')
        for video in videos:
            item = CraigslistItem()
            item['title'] = video.get('title')
            item['link'] = video.get('url')
            yield item

Scrapy delay request

Every time I run my code my IP gets banned. I need help to delay each request by 10 seconds. I've tried placing DOWNLOAD_DELAY in the code but it gives no results. Any help is appreciated.
import re

import scrapy


# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "https://washingtondc.craigslist.org/search/fua"
    ]

    BASE_URL = 'https://washingtondc.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id

            item = DmozItem()
            item["link"] = response.url

            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
You need to set DOWNLOAD_DELAY in the settings.py of your project. Note that you may also need to limit concurrency: by default concurrency is 8, so you are hitting the website with 8 simultaneous requests.
# settings.py
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 2
Starting with Scrapy 1.0 you can also place custom settings in the spider itself, so you could do something like this:
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }
Delay and concurrency are set per downloader slot, not per request. To check what delay and concurrency you actually have, you could try something like this:
def parse(self, response):
    """
    """
    delay = self.crawler.engine.downloader.slots["www.dmoz.org"].delay
    concurrency = self.crawler.engine.downloader.slots["www.dmoz.org"].concurrency
    self.log("Delay {}, concurrency {} for request {}".format(delay, concurrency, response.request))
    return
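If a fixed delay still gets the IP banned, Scrapy's built-in AutoThrottle extension is another option: it adjusts the delay based on how quickly the server responds. A minimal sketch (these are standard Scrapy settings; the values are only examples):

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 10       # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60         # upper bound for the adjusted delay
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # one request at a time per domain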
