scrapy - download image without compressing the picture - python

I am trying to download some images without compression,
e.g. http://p1.pstatp.com/origin/433c000159def0223671
This picture is about 2.0 MB, but when I download it with Scrapy it is only about 120 KB.
settings.py
BOT_NAME = 'toutiao'
SPIDER_MODULES = ['toutiao.spiders']
NEWSPIDER_MODULE = 'toutiao.spiders'

IMAGES_STORE = './images/'
MEDIA_ALLOW_REDIRECTS = True
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

ITEM_PIPELINES = {'toutiao.pipelines.ToutiaoPipeline': 300}
items.py
import scrapy


class ToutiaoItem(scrapy.Item):
    keyword = scrapy.Field()
    title = scrapy.Field()
    urls = scrapy.Field()
spiders.py
import scrapy
from scrapy import Request
from toutiao.items import ToutiaoItem
from urllib.parse import urlencode
import json
import re


class ToutiaopicSpider(scrapy.Spider):
    name = 'toutiaopic'
    allowed_domains = ['toutiao.com']
    keyword = '佳片欣赏·人像'
    param = {
        'offset': 0,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(param)
    start_urls = [url]

    def parse(self, response):
        if response.status == 200:
            data = json.loads(response.body.decode('utf-8'))
            # yield Request(url=data.get('data')[0]['article_url'], callback=self.find_pic)
            if 'data' in data.keys():
                for item in data.get('data'):
                    url = item.get('article_url')
                    if url:
                        yield Request(url, callback=self.find_pic)
            # get more
            if self.param['offset'] < 20:
                self.param['offset'] += 20
                # print('data is', self.data['offset'])
                url = 'https://www.toutiao.com/search_content/?' + urlencode(self.param)
                yield Request(url, callback=self.parse)

    def find_pic(self, response):
        title = response.xpath('//title/text()').extract()[0]
        html = response.body.decode('utf-8').replace('\\', '')
        if 'gallery: JSON.parse' in html:
            images_pattern = re.compile('"url_list".*?"url":"(.*?)"},', re.S)
            urls = re.findall(images_pattern, html)
        else:
            img_pattern = re.compile(r'"(http.*?)"', re.S)
            urls = re.findall(img_pattern, html)
        item = ToutiaoItem()
        item['keyword'] = self.keyword
        item['urls'] = urls
        item['title'] = title
        # print('item:', item['image_urls'], item['title'])
        yield item
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
import re


class ToutiaoPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        self.item = item
        for url in item['urls']:
            self.index = 0
            yield Request(url=url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        item = self.item
        keyword = re.sub(r'[?\\*|“<>:/]', '', item['keyword'])
        title = re.sub(r'[?\\*|“<>:/]', '', item['title'])
        image_name = title + str(int(self.index))
        self.index += 0.5
        return '%s/%s.png' % (keyword, image_name)
I want to download the original picture. What should I do?
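The size drop most likely comes from ImagesPipeline itself: it re-encodes every downloaded image with Pillow as a JPEG at the default quality, so the saved file is much smaller than the original. A minimal sketch of one way to keep the original bytes, assuming the item fields from the code above, is to base the pipeline on FilesPipeline, which writes the downloaded response body to disk unchanged. The class name ToutiaoOriginalPipeline and the .jpg extension are illustrative assumptions; FILES_STORE replaces IMAGES_STORE in settings.py, and ITEM_PIPELINES would point at the new class.

# pipelines.py -- a sketch, not the original code: FilesPipeline stores the
# raw downloaded bytes, so the picture is never re-encoded or recompressed.
import re

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class ToutiaoOriginalPipeline(FilesPipeline):  # hypothetical class name

    def get_media_requests(self, item, info):
        # carry the naming data on each request instead of on self
        for index, url in enumerate(item['urls']):
            yield Request(url, meta={'keyword': item['keyword'],
                                     'title': item['title'],
                                     'index': index})

    def file_path(self, request, response=None, info=None):
        keyword = re.sub(r'[?\\*|"<>:/]', '', request.meta['keyword'])
        title = re.sub(r'[?\\*|"<>:/]', '', request.meta['title'])
        # assumes the source images are JPEGs; the bytes are saved as-is either way
        return '%s/%s_%d.jpg' % (keyword, title, request.meta['index'])

    def item_completed(self, results, item, info):
        if not [x['path'] for ok, x in results if ok]:
            raise DropItem("Item contains no files")
        return item

Because nothing is re-encoded, a 2.0 MB source image should land on disk at roughly 2.0 MB.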

Related

Scrape ajax pages

I do not know how to scrape AJAX pages. There is no pagination on the website; more content is loaded by clicking the "load more" button. This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas as pd


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        books = response.xpath("//h3[@class='card-Title\nbreak-word\nf3\nmb1\nmt0']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.css(".mr3-m::text").get()
        address = response.css(".showcase-address::text").get()
        address = address.strip()
        website = response.xpath("//li[@class='dib ml3 mr3']//a[starts-with(@href, 'http')]/@href").get()
        website = website.strip()
        phone = response.xpath("//li[@class='dib ml3 mr3']//span[contains(text(), 'Phone:')]/following-sibling::text()").get()
        phone = phone.strip().replace("-", "")
        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone
        }
Okay, try the following script to get all the fields you wish to grab, traversing the whole exhibitor list:
import scrapy
from scrapy.selector import Selector


class MapYourShowSpider(scrapy.Spider):
    name = "mapyourshow"
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'
    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        for item in response.json()['DATA']['results']['exhibitor']['hit']:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        elem = response.json()['DATA']['BODYHTML']
        sel = Selector(text=elem)
        title = sel.css("h2::text").get()
        try:
            address = ' '.join([' '.join(i.split()) for i in sel.css("p.showcase-address::text").getall()])
        except AttributeError:
            address = ""
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()
        yield {"title": title, "address": address, "website": website, "phone": phone}
I have not used your code and did it my own way instead (because I'm not a huge fan of Selenium), but I hope this helps anyway:
import requests
import json
import time
from bs4 import BeautifulSoup
import re

headers = {
    'x-requested-with': 'XMLHttpRequest',
}
params = {
    'action': 'search',
    'searchtype': 'exhibitorgallery',
    'searchsize': '200',  # don't increase this too much (increase the start parameter instead and send a new request after some delay)
    'start': '0',
}
response = requests.get('https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm', params=params, headers=headers)
data = json.loads(response.text)

all_sites = []
for exs in data["DATA"]["results"]["exhibitor"]["hit"]:
    id = exs["fields"]["exhid_l"]
    site = f"https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={id}"
    all_sites.append(site)

for site in all_sites:
    response = requests.get(site)
    soup = BeautifulSoup(response.text, "html.parser")
    info_box = soup.find("div", {"id": "showroomContentDiv"})
    title = info_box.find("section", {"id": "scroll-description"}).text.strip().split("\n")[0][6:]
    address = " ".join(info_box.find("p", {"class": "showcase-address"}).text.strip().split())
    website = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[0].text.strip()
    phone = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[1].text[7:].strip()
    print(title)
    print(address)
    print(website)
    print(phone)
    # delay so you don't create too much traffic
    time.sleep(1)

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, even though I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but there is one at
'//h1[@class="post-title entry-title"]/text()'
or, even simpler,
'//h1[@itemprop="headline"]/text()'
And you probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but there is one at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but there is one at
'//p[@class="single-date"]//span[2]/text()'
or, even simpler,
'//span[@itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
You can paste it all into a single file, script.py, and run it as python script.py without creating a project.
I use max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()

How do I overwrite the file_path function in scrapy 1.7.3?

Without overriding the file_path method, the spider downloads all the images with the default "request URL hash" filenames. However, when I try to override the method it just doesn't work: nothing ends up in the default output field, images.
I have tried both relative and absolute paths for the IMAGES_STORE variable in settings.py, as well as in the file_path method, to no avail. Even when I override file_path with the exact same code as the default implementation, the images do not download.
Any help would be much appreciated!
settings.py
BOT_NAME = 'HomeApp2'
SPIDER_MODULES = ['HomeApp2.spiders']
NEWSPIDER_MODULE = 'HomeApp2.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'

# ScrapySplash settings
SPLASH_URL = 'http://192.168.99.100:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'HomeApp2.pipelines.DuplicatesPipeline': 250,
    'HomeApp2.pipelines.ProcessImagesPipeline': 251,
    'HomeApp2.pipelines.HomeApp2Pipeline': 300,
}
IMAGES_STORE = 'files'
pipelines.py
import json

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DuplicatesPipeline(object):

    def __init__(self):
        self.sku_seen = set()

    def process_item(self, item, spider):
        if item['sku'] in self.sku_seen:
            raise DropItem("Repeated item found: %s" % item)
        else:
            self.sku_seen.add(item['sku'])
        return item


class ProcessImagesPipeline(ImagesPipeline):
    '''
    def file_path(self, request):
        print('!!!!!!!!!!!!!!!!!!!!!!!!!')
        sku = request.meta['sku']
        num = request.meta['num']
        return '%s/%s.jpg' % (sku, num)
    '''

    def get_media_requests(self, item, info):
        print('- - - - - - - - - - - - - - - - - -')
        sku = item['sku']
        for num, image_url in item['image_urls'].items():
            yield scrapy.Request(url=image_url, meta={'sku': sku, 'num': num})


class HomeApp2Pipeline(object):

    def __init__(self):
        self.file = open('items.jl', 'w')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + '\n'
        self.file.write(line)
        return item
AppScrape2.py
import scrapy
from scrapy_splash import SplashRequest
from HomeApp2.items import HomeAppItem


class AppScrape2Spider(scrapy.Spider):
    name = 'AppScrape2'

    def start_requests(self):
        yield SplashRequest(
            url='https://www.appliancesonline.com.au/product/samsung-sr400lstc-400l-top-mount-fridge?sli_sku_jump=1',
            callback=self.parse,
        )

    def parse(self, response):
        item = HomeAppItem()
        product = response.css('aol-breadcrumbs li:nth-last-of-type(1) .breadcrumb-link ::text').extract_first().rsplit(' ', 1)
        if product is None:
            return {}
        item['sku'] = product[-1]
        item['image_urls'] = {}
        root_url = 'https://www.appliancesonline.com.au'
        product_picture_count = 0
        for pic in response.css('aol-product-media-gallery-main-image-portal img.image'):
            product_picture_count = product_picture_count + 1
            item['image_urls']['p' + str(product_picture_count)] = (
                root_url + pic.css('::attr(src)').extract_first())
        feature_count = 0
        for feat in response.css('aol-product-features .feature'):
            feature_count = feature_count + 1
            item['image_urls']['f' + str(feature_count)] = (
                root_url + feat.css('.feature-image ::attr(src)').extract_first())
        yield item
items.py
import scrapy


class HomeAppItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    sku = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
After much trial and error, I found the solution. It was simply adding the rest of the parameters to the file_path method.
Changing
def file_path(self, request):
to
def file_path(self, request, response=None, info=None):
It seems that my original code overrode the method with the wrong signature, causing calls to it to fail.
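For reference, here is a sketch of the relevant part of the pipeline with that fix applied, combining the commented-out file_path from the question with the full signature. Scrapy 1.7 calls file_path(request, response=response, info=info), so the original two-argument override fails with a TypeError during download handling, which matches the empty images field described above.

# pipelines.py (relevant part) -- a sketch, not the asker's final code
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ProcessImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        # full signature expected by Scrapy 1.7; the naming data comes from
        # the meta attached in get_media_requests below
        sku = request.meta['sku']
        num = request.meta['num']
        return '%s/%s.jpg' % (sku, num)

    def get_media_requests(self, item, info):
        sku = item['sku']
        for num, image_url in item['image_urls'].items():
            # pass the data needed for naming along with each request
            yield scrapy.Request(url=image_url, meta={'sku': sku, 'num': num})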

Is there a way to bypass PHPSESSID and __cfduid cookies while using proxies and fake_useragent in scrapy?

import scrapy
from scrapy import Request
from NPM.items import NPMItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Join, Compose
import re
import cfscrape
from scrapy_splash import SplashRequest

# import requests
# session = requests.Session()
# print(session.cookies.getdict())
# response = session.get('http://google.com')
# print(session.cookies.get_dict())

script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go{
        splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
    })
    assert(splash:wait(0.5))
    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        cookies = splash:get_cookies(),
        html = splash:html(),
    }
end
"""
class ExampleCrawler(scrapy.Spider):
    name = 'Example'
    custom_settings = {
        'RETRY_TIMES': 5,
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 20,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 20,
        'CONCURRENT_REQUESTS_PER_IP': 20,
        'AUTOTHROTTLE_ENABLED': True,
        'COOKIES_ENABLED': True,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36 ',
        'PROXY_LIST': 'EXAMPLE/proxy.txt'
    }
    allowed_domains = ['example.com']
    start_urls = ['https://example/real-estate/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'X-My-Header': 'value'},
                                )

    def parse(self, response):
        properties = response.xpath('//*[@id="mbuzz"]/following-sibling::table')[0:-1]
        for property in properties:
            links = property.xpath('.//@href').extract_first()
            urlo = response.urljoin(links)
            link = urlo.replace('/real-estate', '')
            # head = response.headers
            #
            # token, u_a = cfscrape.get_tokens(link)
            # cfduid = token['__cfduid']
            #
            # cook = response.headers.getlist('Set-Cookie')
            # # HEAD = Request.meta
            # cook = str(cook)
            # if re.search('PHPSESSID=(.*);', cook):
            #     cookie = re.search('PHPSESSID=(.*);', cook).group(1)
            #     if cookie:
            #         cookie = cookie
            #         yield SplashRequest(link, cookies={'__cfduid': cfduid, 'PHPSESSID': cookie}, headers={'USER_AGENT': u_a}, callback=self.parse_property, meta={'URL': link})
            #     else:
            #         pass
            # else:
            #     yield Request(link, cookies={'__cfduid': cfduid}, headers={'USER_AGENT': u_a}, callback=self.parse_property, meta={'URL': link})
            # print(u_a)
            yield Request(link, callback=self.parse_property, meta={'URL': link})
            # yield Request(link, cookies={'__cfduid': cfduid}, headers={'USER_AGENT': u_a}, callback=self.parse_property, meta={'URL': link})
            # yield SplashRequest(link, self.parse_property,
            #                     endpoint='execute',
            #                     cache_args=['lua_source'],
            #                     args={'lua_source': script},
            #                     headers={'X-My-Header': 'value'},
            #                     )
        rel_next_page = response.xpath('//u[contains(text(), "Next")]/text()/ancestor::a/@href').extract_first()
        next_page = response.urljoin(rel_next_page)
        yield Request(next_page, callback=self.parse)
So far I have tried the commented-out section of the code above.
I can crawl a few pages with the default settings, and increase the crawl to over 100 pages with delays set to 30 seconds.
I think the problem is that PHPSESSID is set only once, at the very beginning, for every combination of proxy and user-agent, while __cfduid is set for the lifetime of the crawl for that combination.
I solved the problem using Scrapy's cookiejar.
Here is the code that sets new cookies for every new request:
def parse(self, response):
    properties = response.xpath('//*[@id="buzz"]/following-sibling::table')[0:-1]
    for i, property in enumerate(properties):
        links = property.xpath('.//@href').extract_first()
        urls = response.urljoin(links)
        yield Request(urls, callback=self.parse_property, meta={'URL': urls, 'cookiejar': i})
    rel_next_page = response.xpath('//u[contains(text(), "Next")]/text()/ancestor::a/@href').extract_first()
    next_page = response.urljoin(rel_next_page)
    for i, url in enumerate(next_page):
        yield Request(next_page, callback=self.parse, meta={'cookiejar': i})

scrapy.Request() callback not working

There are lots of questions about this already, but most people run into the problem because of the "dont_filter" argument. I passed "dont_filter=True" but my custom parse generator still doesn't work. Here is my code (the third parser, "parse_spec", is never called; "parse_models_follow_next_page" works well when called by parse(), but it cannot call itself when it needs to move on to the next page):
import scrapy
from gsmarena.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }
    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        length = len(response.xpath(
            "//div[@class='makers']/self::div//a").extract())
        for i in range(1, length):
            url = "https://www.gsmarena.com/" + \
                response.xpath(
                    "(//div[@class='makers']/self::div//a)[{}]/@href".format(i)).extract()[0]
            model = response.xpath(
                "(//div[@class='makers']/self::div//a//span/text())[{}]".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_spec, meta={'brand': brand, 'model': model}, dont_filter=True)
        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        item['model'] = response.meta.get('model')
        item['brand'] = response.meta.get('brand')
        for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
            item[spec_name] = spec
        yield item
and sorry for my bad English.
Your scraper has a few issues.
allowed_domains = ["https://www.gsmarena.com/"]
should be
allowed_domains = ["www.gsmarena.com"]
Next, you don't have an errback_httpbin method defined in your class:
def errback_httpbin(self, response):
    pass
The code below
for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
should be
for spec_name, spec in zip(response.xpath('//table//td[1]/text()').extract(), response.xpath('//table//td[2]/text()').extract()):
This still has some issues, though.
Also, your code will take some time before the first yield, because the scheduler picks URLs based on the order in which they come in.
I have made some changes in the code and it scrapes all the results except spec_name, which is not specified in an understandable way.
import scrapy
from lxml import html
from tutorial.items import PhoneItems


class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }
    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        meta = response.meta
        doc = html.fromstring(response.body)
        single_obj = doc.xpath('.//div[@class="makers"]/ul//li')
        for obj in single_obj:
            url = self.allowed_domains[0] + obj.xpath('.//a/@href')[0]
            meta['brand'] = obj.xpath('.//a/@href')[0].split('_')[0]
            meta['model'] = obj.xpath('.//a/@href')[0]
            yield scrapy.Request(url=url, callback=self.parse_spec,
                                 meta=meta, dont_filter=True)
        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page,
                                 callback=self.parse_models_follow_next_page,
                                 meta={'brand': brand}, dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        meta = response.meta
        item['model'] = meta['model']
        item['brand'] = meta['brand']
        # Need to specify details about spec_name
        # for spec_name, spec in zip(response.xpath('//table//td[1]').extract(),
        #                            response.xpath('//table//td[2]').extract()):
        #     item[spec_name] = spec
        yield item
