There are plenty of questions about this already, but most people run into the problem because of the dont_filter argument. I do pass dont_filter=True, yet my custom parse generator still doesn't work. Here is my code (the third parser, parse_spec, is never called; parse_models_follow_next_page works fine when called from parse(), but it fails to call itself when it needs to move on to the next page):
import scrapy
from gsmarena.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }

    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        length = len(response.xpath(
            "//div[@class='makers']/self::div//a").extract())
        for i in range(1, length):
            url = "https://www.gsmarena.com/" + \
                response.xpath(
                    "(//div[@class='makers']/self::div//a)[{}]/@href".format(i)).extract()[0]
            model = response.xpath(
                "(//div[@class='makers']/self::div//a//span/text())[{}]".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_spec, meta={'brand': brand, 'model': model}, dont_filter=True)
        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        item['model'] = response.meta.get('model')
        item['brand'] = response.meta.get('brand')
        for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
            item[spec_name] = spec
        yield item
And sorry for my bad English.
Your scraper has a few issues.
allowed_domains = ["https://www.gsmarena.com/"]
should be
allowed_domains = ["www.gsmarena.com"]
Next, you don't have an errback_httpbin method defined in your class:
def errback_httpbin(self, response):
pass
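If you want that errback to do more than swallow errors, here is a minimal sketch of how such an errback is usually wired up (the logging body is only illustrative; note that Scrapy passes a Twisted Failure to the errback rather than a response):

from scrapy.spidermiddlewares.httperror import HttpError

def errback_httpbin(self, failure):  # inside the spider class
    # Log anything that failed at the download or middleware level.
    self.logger.error(repr(failure))
    if failure.check(HttpError):
        self.logger.error('HttpError on %s', failure.value.response.url)

and attach it when building requests:

yield scrapy.Request(url, callback=self.parse_spec,
                     errback=self.errback_httpbin, dont_filter=True)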
The following code
for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
should be
for spec_name, spec in zip(response.xpath('//table//td[1]/text()').extract(), response.xpath('//table//td[2]/text()').extract()):
This still has some issues, though.
Also, your code will take some time before the first yield, because the scheduler picks URLs based on the order in which they come in.
I have made some changes to the code, and it scrapes all the results except spec_name, which is not specified in an understandable way.
import scrapy
from lxml import html
from tutorial.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }

    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        meta = response.meta
        doc = html.fromstring(response.body)
        single_obj = doc.xpath('.//div[@class="makers"]/ul//li')
        for obj in single_obj:
            url = self.allowed_domains[0] + obj.xpath('.//a/@href')[0]
            meta['brand'] = obj.xpath('.//a/@href')[0].split('_')[0]
            meta['model'] = obj.xpath('.//a/@href')[0]
            yield scrapy.Request(url=url, callback=self.parse_spec,
                                 meta=meta, dont_filter=True)
        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        meta = response.meta
        item['model'] = meta['model']
        item['brand'] = meta['brand']
        # Need to specify details about spec_name
        # for spec_name, spec in zip(response.xpath('//table//td[1]').extract(),
        #                            response.xpath('//table//td[2]').extract()):
        #     item[spec_name] = spec
        yield item
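If you do want the spec rows as well, one possible approach is sketched below. It assumes the spec tables mark label cells with class "ttl" and value cells with class "nfo" (check the actual page source), and it stores everything in a single hypothetical 'specs' field instead of one item field per spec name:

        # inside parse_spec(); 'specs' would need to be declared on PhoneItems
        specs = {}
        names = response.xpath("//table//td[@class='ttl']//text()").extract()
        values = response.xpath("//table//td[@class='nfo']//text()").extract()
        for spec_name, spec in zip(names, values):
            specs[spec_name.strip()] = spec.strip()
        item['specs'] = specs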
import scrapy
from scrapy import Request
from NPM.items import NPMItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Join, Compose
import re
import cfscrape
from scrapy_splash import SplashRequest
# import requests
# session=requests.Session()
# print(session.cookies.getdict())
# response = session.get('http://google.com')
# print(session.cookies.get_dict())
script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go{
        splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
    })
    assert(splash:wait(0.5))

    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        cookies = splash:get_cookies(),
        html = splash:html(),
    }
end
"""
class ExampleCrawler(scrapy.Spider):
    name = 'Example'
    custom_settings = {
        'RETRY_TIMES': 5,
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 20,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 20,
        'CONCURRENT_REQUESTS_PER_IP': 20,
        'AUTOTHROTTLE_ENABLED': True,
        'COOKIES_ENABLED': True,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36 ',
        'PROXY_LIST': 'EXAMPLE/proxy.txt'
    }
    allowed_domains = ['example.com']
    start_urls = ['https://example/real-estate/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'X-My-Header': 'value'},
                                )

    def parse(self, response):
        properties = response.xpath('//*[@id="mbuzz"]/following-sibling::table')[0:-1]
        for property in properties:
            links = property.xpath('.//@href').extract_first()
            urlo = response.urljoin(links)
            link = urlo.replace('/real-estate', '')
            # head=response.headers
            #
            # token,u_a=cfscrape.get_tokens(link)
            # cfduid=token['__cfduid']
            #
            # cook=response.headers.getlist('Set-Cookie')
            # # HEAD=Request.meta
            # cook=str(cook)
            # if re.search('PHPSESSID=(.*);',cook):
            #     cookie=re.search('PHPSESSID=(.*);', cook).group(1)
            #     if cookie:
            #         cookie=cookie
            #         yield SplashRequest(link, cookies={'__cfduid':cfduid,'PHPSESSID':cookie},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            #     else:
            #         pass
            # else:
            #     yield Request(link, cookies={'__cfduid':cfduid},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            # print(u_a)

            yield Request(link, callback=self.parse_property, meta={'URL': link})
            # yield Request(link, cookies={'__cfduid':cfduid},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            # yield SplashRequest(link, self.parse_property,
            #                     endpoint='execute',
            #                     cache_args=['lua_source'],
            #                     args={'lua_source': script},
            #                     headers={'X-My-Header': 'value'},
            #                     )

        rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
        next_page = response.urljoin(rel_next_page)
        yield Request(next_page, callback=self.parse)
So far I have tried the commented-out sections of the code above.
I can crawl a few pages with the default settings, and get to over a hundred by raising the delay to 30 seconds.
I think the problem is that PHPSESSID is set only once, at the very beginning, for every combination of proxy and user-agent, while __cfduid is set for the lifetime of the crawl for that combination.
I solved the problem using Scrapy's cookiejar.
Here is the code that assigns a new cookie jar to every new request:
def parse(self, response):
    properties = response.xpath('//*[@id="buzz"]/following-sibling::table')[0:-1]
    for i, property in enumerate(properties):
        links = property.xpath('.//@href').extract_first()
        urls = response.urljoin(links)
        yield Request(urls, callback=self.parse_property,
                      meta={'URL': urls, 'cookiejar': i})

    rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
    next_page = response.urljoin(rel_next_page)
    # The pagination request keeps whatever jar the current page was using.
    yield Request(next_page, callback=self.parse,
                  meta={'cookiejar': response.meta.get('cookiejar', 0)})
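Any later request that forwards the same 'cookiejar' value keeps using that jar, so the per-listing session survives follow-up requests. A sketch (the detail-page XPath and the parse_details callback are placeholders):

def parse_property(self, response):
    detail_url = response.urljoin(response.xpath('//a/@href').extract_first())
    # Forward the jar index so this request reuses the same cookies.
    yield Request(detail_url, callback=self.parse_details,
                  meta={'cookiejar': response.meta['cookiejar']})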
I am trying to download some images without compression.
For example, http://p1.pstatp.com/origin/433c000159def0223671 is about 2.0 MB, but when I download it using Scrapy it is only about 120 KB.
settings.py
BOT_NAME = 'toutiao'
SPIDER_MODULES = ['toutiao.spiders']
NEWSPIDER_MODULE = 'toutiao.spiders'
IMAGES_STORE = './images/'
MEDIA_ALLOW_REDIRECTS = True
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36X-Requested-With:XMLHttpRequest'
}
ITEM_PIPELINES = {'toutiao.pipelines.ToutiaoPipeline': 300,}
items.py
import scrapy


class ToutiaoItem(scrapy.Item):
    keyword = scrapy.Field()
    title = scrapy.Field()
    urls = scrapy.Field()
spiders.py
import scrapy
from scrapy import Request
from toutiao.items import ToutiaoItem
from urllib.parse import urlencode
import json
import re


class ToutiaopicSpider(scrapy.Spider):
    name = 'toutiaopic'
    allowed_domains = ['toutiao.com']
    keyword = '佳片欣赏·人像'
    param = {'offset': 0,
             'format': 'json',
             'keyword': keyword,
             'autoload': 'true',
             'count': '20',
             'cur_tab': '1'}
    url = 'https://www.toutiao.com/search_content/?' + urlencode(param)
    start_urls = [url, ]

    def parse(self, response):
        if response.status == 200:
            data = json.loads(response.body.decode('utf-8'))
            # yield Request(url=data.get('data')[0]['article_url'], callback=self.find_pic)
            if 'data' in data.keys():
                for item in data.get('data'):
                    url = item.get('article_url')
                    if url:
                        yield Request(url, callback=self.find_pic)
            # get more
            if self.param['offset'] < 20:
                self.param['offset'] += 20
                # print('offset is', self.data['offset'])
                url = 'https://www.toutiao.com/search_content/?' + urlencode(self.param)
                yield Request(url, callback=self.parse)

    def find_pic(self, response):
        title = response.xpath('//title/text()').extract()[0]
        html = response.body.decode('utf-8').replace('\\', '')
        if 'gallery: JSON.parse' in html:
            images_pattern = re.compile('"url_list".*?"url":"(.*?)"},', re.S)
            urls = re.findall(images_pattern, html)
        else:
            img_pattern = re.compile(r'"(http.*?)"', re.S)
            urls = re.findall(img_pattern, html)
        item = ToutiaoItem()
        item['keyword'] = self.keyword
        item['urls'] = urls
        item['title'] = title
        # print('item:', item['image_urls'], item['title'])
        yield item
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
import re


class ToutiaoPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        self.item = item
        for url in item['urls']:
            self.index = 0
            yield Request(url=url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        item = self.item
        keyword = re.sub(r'[?\\*|“<>:/]', '', item['keyword'])
        title = re.sub(r'[?\\*|“<>:/]', '', item['title'])
        image_name = title + str(int(self.index))
        self.index += 0.5
        return '%s/%s.png' % (keyword, image_name)
I want to download the original picture. What should I do?
I am new to Scrapy. I have a base spider similar to the example below:
class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['example.com']  # the domain where the spider is allowed to crawl
    start_urls = ['http://www.example.com/content/']  # url from which the spider will start crawling
    page_incr = 1
    flag = 0

    def parse(self, response):
        sel = Selector(response)
        stuffs = sel.xpath('//a/@href')
        for stuff in stuffs:
            link = stuff.extract()
            req1 = Request(url=link, callback=self.parse_item)
            yield req1
        url = 'http://www.example.com/content/?q=ajax//date/%d&page=%d' % (self.page_incr, self.page_incr)
        req2 = Request(url=url,
                       headers={"Referer": "http://www.example.com/content", "X-Requested-With": "XMLHttpRequest"},
                       callback=self.parse_xhr)
        yield req2

    def parse_xhr(self, response):
        sel = Selector(response)
        stuffs = sel.xpath('//a/@href')
        for stuff in stuffs:
            link = stuff.extract()
            yield Request(url=link, callback=self.parse_item)
        content = sel.xpath('//a/@href').extract()
        if content == []:
            self.flag += 1
            if self.flag == 5:
                raise CloseSpider('WARNING: <Spider forced to stop>')
        else:
            self.flag = 0
            self.page_incr += 1
            url = 'http://www.example.com/content/?q=ajax//date/%d&page=%d' % (self.page_incr, self.page_incr)
            req3 = Request(url=url,
                           headers={"Referer": "http://www.example.com/content", "X-Requested-With": "XMLHttpRequest"},
                           callback=self.parse_xhr)
            yield req3

     def parse_item(self, response):
         pass
When I try to run the crawl, I get this error:
line 24, in parse
req1 = Request(url=link, callback=self.parse_item)
exceptions.AttributeError: 'MySpider' object has no attribute 'parse_item'
I am not getting it... Please help me see what is wrong!
Thanks for your time and help.
Your parse_item() method is incorrectly indented (with 5 spaces instead of 4).
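In other words, parse_item() has to sit at the same indentation level as the other methods so that Python binds it as an attribute of MySpider. A minimal sketch of the corrected layout:

class MySpider(scrapy.Spider):
    name = 'myspider'

    def parse(self, response):
        ...

    def parse_item(self, response):  # four spaces, aligned with parse()
        pass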
I'm using Scrapy to collect some data and everything works fine except the email extraction part. For some reason the email row in the .csv file is blank, or only a few emails are extracted. I've tried limiting download_delay and CLOSESPIDER_ITEMCOUNT, but it's not working. Any help is much appreciated.
import re

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]
    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/sdo/cto/" + item_id
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
First of all, a quote from the Terms of Use as a warning:
USE. You agree not to use or provide software (except for general
purpose web browsers and email clients, or software expressly licensed
by us) or services that interact or interoperate with CL, e.g. for
downloading, uploading, posting, flagging, emailing, search, or mobile
use. Robots, spiders, scripts, scrapers, crawlers, etc. are
prohibited, as are misleading, unsolicited, unlawful, and/or spam
postings/email. You agree not to collect users' personal and/or
contact information ("PI").
Several things to fix here:
the contact information is under reply/hnf/cto/ instead of reply/sdo/cto/
specify User-Agent and X-Requested-With headers
The complete code that works for me:
import re
from urlparse import urljoin

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]
    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = urljoin(self.BASE_URL, link)
            yield scrapy.Request(absolute_url,
                                 callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = urljoin(self.BASE_URL, "reply/hnf/cto/" + item_id)
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url,
                                  meta={'item': item},
                                  callback=self.parse_contact,
                                  headers={"X-Requested-With": "XMLHttpRequest",
                                           "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"})

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
Every time I run my code my IP gets banned. I need help delaying each request by 10 seconds. I've tried to place DOWNLOAD_DELAY in the code but it gives no results. Any help is appreciated.
# item class included here
import re

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "https://washingtondc.craigslist.org/search/fua"
    ]
    BASE_URL = 'https://washingtondc.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id
            item = DmozItem()
            item["link"] = response.url
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
You need to set DOWNLOAD_DELAY in the settings.py of your project. Note that you may also need to limit concurrency. By default concurrency is 8, so you are hitting the website with 8 simultaneous requests.
# settings.py
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 2
Starting with Scrapy 1.0 you can also place custom settings in the spider itself, so you could do something like this:
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }
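If you would rather let Scrapy adapt the delay to the server's response times, the AutoThrottle extension can be enabled alongside (or instead of) a fixed DOWNLOAD_DELAY; a minimal sketch, with numbers you would tune for your target site:

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 10          # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 60            # cap when the server slows down
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average concurrent requests per remote server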
Delay and concurrency are set per downloader slot, not per request. To check what delay and concurrency you actually have, you could try something like this:
def parse(self, response):
    """
    """
    delay = self.crawler.engine.downloader.slots["www.dmoz.org"].delay
    concurrency = self.crawler.engine.downloader.slots["www.dmoz.org"].concurrency
    self.log("Delay {}, concurrency {} for request {}".format(delay, concurrency, response.request))
    return