My main file:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from urlparse import urljoin
class Product(scrapy.Item):
brand = scrapy.Field()
title = scrapy.Field()
link = scrapy.Field()
name = scrapy.Field()
title = scrapy.Field()
date = scrapy.Field()
heading = scrapy.Field()
data = scrapy.Field()
Model_name = scrapy.Field()
class aqaqspider(CrawlSpider):
name = "mouth_shut_new"
allowed_domains = ["mouthshut.com"]
start_urls = ["http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476"
]
rules = (
Rule(
SgmlLinkExtractor(allow=('.*\-page-.*',)),
callback="parse_start_url",
follow=True),
)
def parse_start_url(self, response):
        products = response.xpath('//div[@id="allreviews"]/ul/li')
items = []
if not products:
raise CloseSpider("No more products!")
for product in products:
item = Product()
#item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
            item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
yield old_item
# yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme" ,
# headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
# callback=self.parse,
# dont_filter=True)
My settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'mouth'
SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'
My pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item
I ran scrapy crawl mouth_shut_new, but my data was not stored in the database. The output should show that the data was stored in MongoDB along with the collection name. What am I missing?
The process_item() method is not indented properly; it should be:
class MongoDBPipeline(object):

    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item
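As a side note, pymongo.Connection and scrapy.log were removed in newer releases of pymongo and Scrapy. If you ever upgrade, a minimal sketch of the same pipeline using MongoClient, the standard logging module, and from_crawler (assuming the same setting names as above) would look like this; it is a sketch, not a drop-in replacement:

import logging

import pymongo


class MongoDBPipeline(object):

    def __init__(self, mongo_host, mongo_port, mongo_db, mongo_collection):
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection

    @classmethod
    def from_crawler(cls, crawler):
        # read the same setting names this project already uses in settings.py
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DATABASE'), s.get('MONGODB_COLLECTION'))

    def open_spider(self, spider):
        # MongoClient replaces the pymongo.Connection class removed in pymongo 3.x
        self.client = pymongo.MongoClient(self.mongo_host, self.mongo_port)
        self.collection = self.client[self.mongo_db][self.mongo_collection]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert()
        self.collection.insert_one(dict(item))
        logging.debug("Item written to MongoDB collection %s", self.mongo_collection)
        return item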
You also didn't yield the item in the callback function (callback="parse_start_url"). You should do it like this:
def parse_start_url(self, response):
    ...
    for product in products:
        item = Product()
        ....
        yield item
I'm trying to put all the data I'm scraping into MongoDB so I can monitor property prices.
I've already run a lot of tests, but it's not working.
I'll put the code here; I'd appreciate any help.
__init__.py
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
name = 'realstatedata'
allowed_domains = ['vivareal.com.br']
start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']
def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
yield from self.scrape(response)
if nextpageurl:
path = nextpageurl.extract_first()
#print(path)
# Got #pagina=2 => Replace with ?pagina=2
path = '?' + path[1:]
#print(path)
nextpage = response.urljoin(path)
print("Found url: {}".format(nextpage))
yield scrapy.Request(nextpage)
def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy
class RealstatedataItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
description = scrapy.Field()
address = scrapy.Field()
prop_area = scrapy.Field()
prop_rooms = scrapy.Field()
prop_bath = scrapy.Field()
prop_parking = scrapy.Field()
price_rent = scrapy.Field()
price_cond = scrapy.Field()
realstate_name = scrapy.Field()
pass
pipeline.py
In this part of the code I've tried two different approaches, but neither works.
import pymongo
import logging
class MongoPipeline(object):
collection_name = 'rent_properties'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
    @classmethod
def from_crawler(cls, crawler):
## pull in information from settings.py
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
## initializing spider
## opening db connection
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
## clean up when spider is closed
self.client.close()
def process_item(self, item, spider):
## how to handle each post
self.db[self.collection_name].insert(dict(item))
logging.debug("Properties added to MongoDB")
return item
Your setting for enabling the pipeline is wrong: ITEM_PIPELINES should be defined as a dict, not a list. With your current code, the pipeline is not loaded at all.
ITEM_PIPELINES = {
"realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents the priority, which matters when more than one pipeline is enabled.
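For example, if a second pipeline were added later (the cleaning pipeline below is hypothetical), the lower number runs first:

ITEM_PIPELINES = {
    "realstatedata.pipelines.PriceCleaningPipeline": 100,  # hypothetical: normalise prices first
    "realstatedata.pipelines.MongoPipeline": 300,          # then write the item to MongoDB
}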
I created a Scrapy project to scrape some information off a classifieds website, but the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the CSV file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader
class TestSpiderSpider(scrapy.Spider):
name = 'test'
page_number = 2
start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']
def parse(self, response):
for car in response.css('.col.l3.s12.m6'):
items = TestItem()
product_title = car.css('.jco-card-title::text').extract()
product_imagelink = car.css('.card-image img::attr(data-src)').getall()
urls = car.css('.card-image a::attr(href)').getall()
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
if product_title and product_imagelink:
items['urls'] = urls
def parse_details(self, response):
l= ItemLoader(item=TestItem(), selector=response)
l.add_css('product_title','#title::text')
yield l.load_item()
pass
Here's my items.py
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags
class TestItem(scrapy.Item):
product_title = scrapy.Field(input_processors= MapCompose(remove_tags),output_processor= TakeFirst())
pass
Here's my settings.py:
BOT_NAME = 'test'
SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipeline.py:
class TestPipeline:
def process_item(self, item, spider):
return item
You don't need pipelines enabled to use an ItemLoader; try it without.
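Two related things worth double-checking, neither of which needs a pipeline: Scrapy's ItemLoader reads the field metadata keys input_processor and output_processor (singular, not input_processors), and the CSV can be produced by the built-in feed exporter. A minimal sketch of the item under those assumptions:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags


class TestItem(scrapy.Item):
    # "input_processor" (singular) is the key ItemLoader actually looks for
    product_title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

Then run the crawl with a feed export, e.g. scrapy crawl test -o output.csv, and the loaded items should appear in the file.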
I want to store the scraped data in MongoDB, but I am getting an error:
File "C:\Python27\lib\site-packages\six.py", line 599, in iteritems
return d.iteritems(**kw)
AttributeError: 'list' object has no attribute 'iteritems'
I have not used iteritems anywhere in the program.
Here is the program code:
ex.py
import scrapy
from example.items import ExampleItem
class ExampleSpider(scrapy.Spider):
name = 'aaa'
allowed_domains = ["in.bookmyshow.com"]
start_urls = ["https://in.bookmyshow.com/movies"]
def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
for url in set(links):
url = response.urljoin(url)
yield scrapy.Request(url, callback=self.parse_movie)
def parse_movie(self, response):
item = {}
        item['Moviename'] = map(unicode.strip, response.xpath('.//h1[@id="eventTitle"]/text()').extract())
item['Language'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[1]/a/text()').extract())
item['Info'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[3]/a/text()').extract())
yield item
settings.py:
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
ITEM_PIPELINES = ['example.pipelines.MongoDBPipeline', ]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "ticketbook"
MONGODB_COLLECTION = "movies"
pipelines.py
import pymongo
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class ExamplePipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item
I would like to know where I have gone wrong.
In your settings.py, change ITEM_PIPELINES from a list to a dictionary, like so:
ITEM_PIPELINES = { 'example.pipelines.MongoDBPipeline': 100 }
See explanation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html#activating-an-item-pipeline-component
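Once the pipeline is actually loaded, there are two more mismatches in the posted code worth noting: settings.py defines MONGODB_SERVER and MONGODB_DB while the pipeline reads MONGODB_HOST and MONGODB_DATABASE, and the class is named ExamplePipeline while ITEM_PIPELINES points at MongoDBPipeline. A sketch that keeps the pipeline consistent with the posted settings, using the same old-style pymongo/Scrapy APIs the question already uses:

import pymongo
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):

    def __init__(self):
        # use the setting names that settings.py actually defines
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB collection {}".format(settings['MONGODB_COLLECTION']))
        return item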
I am trying to build a spider for a school project, scraping recipes from allrecipes.com. Everything is working really well, except that I seem unable to remove duplicate recipes where one URL contains the actual recipe and the other contains the same URL with "video=true" appended.
Here is my attempt at dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log
class DuplicatesPipeline(object):
# minCal = 50
def __init__(self):
self.urls_seen = set()
def process_vids(self, item, spider):
video = "video=true"
url = str(item.get('url'))
if video in url:
raise DropItem("Contains video")
else:
return item
def process_item(self, item, spider):
unique_id = item.get('url')
if unique_id in self.urls_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.urls_seen.add('url')
return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field
class Website(Item):
name = Field()
url = Field()
description = Field()
kcal = Field()
carbs = Field()
fat = Field()
protein = Field()
main = Field()
sugar = Field()
fibre = Field()
author = Field()
rating = Field()
img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
import urlparse
import scrapy
page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["allrecipes.com"]
start_urls = [page % 1]
rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths = '//a[contains(.,"NEXT")]'),
callback="parse", follow= True),
]
def __init__(self):
self.page_number = 1
def parse(self, response):
print "-------------------------------------------------"
print self.page_number
print "-------------------------------------------------"
sel = Selector(response)
        sites = response.xpath('//div[@id="divGridItemWrapper"]')
items = []
for site in sites:
item = Website()
            recipe = response.xpath('//a[contains(@href, "/Recipe/")]/@href').extract()
url = "http://www.allrecipes.com"
for nth in recipe:
go = urlparse.urljoin(url, str(nth))
items.append(item)
for link in go:
yield Request(go, self.recipes)
if self.page_number <= 3:
self.page_number += 1
yield Request(page % self.page_number)
else:
pass
def recipes(self,response):
item = Website()
sel = Selector(response)
        recipe = response.xpath('//div[@id="content-wrapper"]')
items = []
print "second page - %s" % response.url
for i in recipe:
item['url'] = response.url
            item['description'] = i.xpath('//span[@itemprop="description"]/text()').extract()
            item['name'] = i.xpath('//h1[@itemprop="name"]/text()').extract()
            item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
            item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['main'] = "allrecipes.com"
            item['sugar'] = i.xpath('//li/span[@itemprop="sugarContent"]/text()').extract()
            item['fibre'] = i.xpath('//li/span[@itemprop="proteinContent"]/text()').extract()
            item['author'] = i.xpath('//span[@id="lblUser0"]/text()').extract()
            item['rating'] = i.xpath('//div[@class="rating-stars-img"][1]/meta[1][@itemprop="ratingValue"]/@content').extract()
            item['img'] = i.xpath('//img[@id="imgPhoto"]/@src').extract()
items.append(item)
yield item
I am a little new to Python, and I'm not sure whether I need to convert item['url'] into a string; I have tried with str() and without. I have also tried a few other methods that others have used for something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
You need to create a class that implements the process_item method in your pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
url = item['url']
u = urlparse(url)
query = parse_qs(u.query)
query.pop('video', None)
u = u._replace(query=urlencode(query, True))
unique_id = urlunparse(u)
if unique_id and unique_id in self.ids_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.ids_seen.add(unique_id)
return item
Then you need to add that class to settings.py:
ITEM_PIPELINES = {
'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
Let me know if it helps.
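If the project is on Python 3, the same idea works with urllib.parse instead of the Python 2 urllib/urlparse modules; a minimal sketch:

from urllib.parse import urlencode, urlparse, urlunparse, parse_qs

from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        # strip the "video" query parameter before comparing URLs
        u = urlparse(item['url'])
        query = parse_qs(u.query)
        query.pop('video', None)
        unique_id = urlunparse(u._replace(query=urlencode(query, True)))
        if unique_id in self.ids_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        self.ids_seen.add(unique_id)
        return item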
I am trying to crawl images from a website with the following scrapy code:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website
class MobileSpider(CrawlSpider):
name = "mobile"
allowed_domains = ["mobile-store.ro"]
start_urls = ["http://www.mobile-store.ro/produse/"]
rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item')
)
def parse(self, response, response2):
hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
if not not next_page:
yield Request(next_page[0], self.parse)
sites = hxs.select('//div[#id="wrapper"]/div[#id="content"]')
items = []
for site in sites:
item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url,image_relative_url)]
            #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
item['url'] = response.url
items.append(item)
for item in items:
yield item
settings.py:
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
items.py:
from scrapy.item import Item, Field
class Website(Item):
nume = Field()
descriere = Field()
categorie = Field()
brand = Field()
pret = Field()
url = Field()
image_urls = Field()
images = Field()
image_paths = Field()
pipelines.py:
from mobile.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MyImagesPipeline(ImagesPipeline):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
The issue comes when I try to get the image URL using the following code:
for site in sites:
    item = Website()
    item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
    item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
    item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
    item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
    image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
    item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)]
    #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
    item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
    item['url'] = response.url
    items.append(item)
for item in items:
    yield item
This returns the page URL instead of the image URL. All other fields are crawled correctly. Any clues on how to fix this issue and get the image URL properly?
This is because the image (and the whole content of the ad-image-wrapper div) is filled dynamically via JavaScript.
Dumping response.body in the parse method helped me figure out that the actual image link is originally kept in the ad-thumb-list list. So, try using the following to get the image URL:
image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    image_relative_url = image_relative_url[0]
Hope that is what you needed.
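To feed that into the images pipeline, the relative href still has to be joined against the page URL, as the original code already does; a small sketch reusing the question's urlparse import:

image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
if image_relative_url:
    # make the link absolute so ImagesPipeline can download it
    item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url[0])]
else:
    item['image_urls'] = []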