I am new to Python and web scraping. I tried storing the Scrapy data to a CSV file, but the output is not satisfactory.
Current csv output:
Title Image
Audi,Benz,BMW Image1,Image2,Image3
How I would like it to look in the CSV file:
Title Image
Audi Image1
Benz Image2
BMW Image3
This is what I type in the terminal to run it:
scrapy crawl testscraper -t csv -o test.csv
Here's the spider.py:
class TestSpiderSpider(scrapy.Spider):
    name = 'testscraper'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        items = scrapeItem()
        product_title = response.css('.jco-card-title::text').extract()
        product_imagelink = response.css('.card-image img::attr(data-src)').getall()
        items['product_title'] = product_title
        items['product_imagelink'] = product_imagelink
        items.append('items')
        yield items
Here's the code for items.py:
class scrapeItem(scrapy.Item):
    product_title = scrapy.Field()
    product_imagelink = scrapy.Field()
    pass
You can select every div element that contains a car and then iterate over those elements, yielding them one by one.
def parse(self, response):
    for car in response.css('.col.l3.s12.m6'):
        item = scrapeItem()
        product_title = car.css('.jco-card-title::text').get()
        product_imagelink = car.css('.card-image img::attr(data-src)').get()
        # Some of the elements don't contain a title or an image link, like ads for example.
        if product_title and product_imagelink:
            item['product_title'] = product_title.strip().replace('\n', '')
            item['product_imagelink'] = product_imagelink
            yield item
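With the spider yielding one item per car like this, the same command from the question (scrapy crawl testscraper -t csv -o test.csv) should now write one row per car instead of a single row containing everything.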
I am trying to scrape data using Scrapy.
All parts of the data are extracted except the product image URL.
When I try to extract the image URL, it returns a list of empty strings.
Project Code
menscloths.py (Spider)
import scrapy
from ..items import DataItem
class MensclothsSpider(scrapy.Spider):
    name = 'menscloths'
    next_page = 2
    start_urls = ['https://www.example.com/clothing-and-accessories/topwear/pr?sid=clo%2Cash&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&page=1']

    def parse(self, response):
        items = DataItem()
        products = response.css("div._1xHGtK")
        for product in products:
            name = product.css(".IRpwTa::text").extract()
            brand = product.css("._2WkVRV::text").extract()
            original_price = product.css("._3I9_wc::text").extract()[1]
            sale_price = product.css("._30jeq3::text").extract()[0][1:]
            image_url = product.css("._2r_T1I::attr('src')").extract()
            product_page_url = "https://www.example.com" + product.css("._2UzuFa::attr('href')").extract()[0]
            product_category = "men topwear"

            items["name"] = name
            items["brand"] = brand
            items["original_price"] = original_price
            items["sale_price"] = sale_price
            items["image_url"] = image_url
            items["product_page_url"] = product_page_url
            items["product_category"] = product_category
            yield items
item.py
import scrapy
class DataItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    brand = scrapy.Field()
    original_price = scrapy.Field()
    sale_price = scrapy.Field()
    image_url = scrapy.Field()
    product_page_url = scrapy.Field()
    product_category = scrapy.Field()
setting.py
BOT_NAME = 'scraper'
SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'
ITEM_PIPELINES = {
    'scraper.pipelines.ScraperPipeline': 300,
}
Thanks in advance.
I've seen this happen multiple times before. If you watch the images closely when you load the page, you can see that they only appear after a short delay (for me, about 1 second). Your code, however, just loads the page and immediately tries to get the images, without waiting for them to load in. You need some kind of wait so the images have time to load before you try to extract them.
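As an illustration (not part of the original answer), here is a minimal sketch of that idea using Selenium to render the page with an explicit wait, then parsing the rendered HTML with a Scrapy Selector. The img._2r_T1I selector is taken from the question and the URL is a placeholder, so both may need adjusting:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector

driver = webdriver.Chrome()
driver.get("https://www.example.com/clothing-and-accessories/topwear/pr?page=1")  # placeholder URL

# Wait up to 10 seconds for at least one product image to appear in the DOM.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "img._2r_T1I"))
)

# Hand the rendered HTML to a Scrapy selector and pull the image URLs.
sel = Selector(text=driver.page_source)
image_urls = sel.css("img._2r_T1I::attr(src)").getall()
driver.quit()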
I'm currently working on Scrapy code that extracts 3 types of data for each product; I call them "title, price, and upc". For each product my program scrapes the title and price correctly, but I am having trouble scraping the UPC, since the UPC is on another page.
What I want my program to do for each product is to extract the title and price on the main page, then go into another page to extract the UPC code. Once it gets the UPC code, I want the program to go to the next product on the main page and repeat the same method for the remaining products.
Here is my code.
import scrapy
from scrapy.utils.response import open_in_browser
from ..items import QuotetutorialItem
data={hidden}
headers={hidden}
class BrickseekSpider(scrapy.Spider):
    name = 'brickseek1'
    allowed_domains = ['brickseek.com']

    def start_requests(self):
        dont_filter = True
        yield scrapy.http.FormRequest(url='https://brickseek.com/login/', headers=headers, formdata=data,
                                      callback=self.parse)

    def parse(self, response):
        items = QuotetutorialItem()
        products = response.css('div.item-list__tile')
        for product in products:
            title = product.css('.item-list__title span::text').extract()
            price = product.css('.item-list__price-column--highlighted .price-formatted__dollars::text').extract()
            # another_page = response.css('div.item-list__tile a::attr(href)').get()
            # if another_page:
            #     upc = product.css('div.item-overview__meta-item::text').extract()[6]
            #     yield response.follow(another_page, callback=self.parse)
            items['title'] = title
            items['price'] = price
            # items['upc'] = upc
            yield items
All you need to do is put your item (after filling in title and price) into meta when you visit the next page (assuming your CSS selectors are correct):
def parse(self, response):
    products = response.css('div.item-list__tile')
    for product in products:
        item = QuotetutorialItem()
        item['title'] = product.css('.item-list__title span::text').extract()
        item['price'] = product.css('.item-list__price-column--highlighted .price-formatted__dollars::text').extract()
        another_page = product.css('a::attr(href)').get()  # relative to the current product
        if another_page:
            yield response.follow(another_page, callback=self.parse_upc, meta={'item': item})
        else:
            yield item

def parse_upc(self, response):
    item = response.meta['item']
    item['upc'] = response.css('div.item-overview__meta-item::text').extract()[6]
    yield item
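As a side note (not from the original answer): in Scrapy 1.7+ you can pass the item with cb_kwargs instead of meta, e.g. yield response.follow(another_page, callback=self.parse_upc, cb_kwargs={'item': item}), and define the callback as def parse_upc(self, response, item). The meta approach above still works.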
I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (pull item) and the state in which it was pulled (state item). It's a self-loading page, so when I add the pull date, I only get the first 10 results (or only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I have commented out (#) the lines that aren't working properly:
Item.py file:
import scrapy
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    # pull = scrapy.Field()
    # state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem
class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        # pull = time.strftime("%m/%d/%Y")
        # state = "AL"

        for item in zip(name, link, obit, news):  # removed 'pull, state'
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            # new_item['pull'] = pull
            # new_item["state"] = state
            yield new_item
Here is why: if you add pull and state to zip(name, link, obit, news), you only get 2 iterations, because state = "AL" is a string. zip() treats a string as a sequence of characters and always stops at its shortest argument, so the two characters of "AL" limit the whole loop to 2 tuples (with the date "01/01/2001", which has 10 characters, you would get 10 iterations).
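A quick illustration of that zip() behaviour in plain Python (example values, not taken from the spider):

names = ['Ann', 'Bob', 'Cara', 'Dan']  # 4 scraped values
state = 'AL'                           # a 2-character string
print(list(zip(names, state)))
# [('Ann', 'A'), ('Bob', 'L')]  <- zip() stops at the shortest argument, so only 2 iterations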
This will work:
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    pull = scrapy.Field()
    state = scrapy.Field()
class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        pull = time.strftime("%m/%d/%Y")
        state = "AL"

        for item in zip(name, link, obit, news):  # pull and state stay out of the zip
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            new_item['pull'] = pull
            new_item["state"] = state
            yield new_item
I'm attempting to scrape a website to get a very rough demographic of its users (no personally identifying information or photos), but the tutorial spider from the official documentation that I've modified keeps repeating the same line of output 4 times in a row.
A copy of the code I'm using is below:
Note that the example profile I've included in the code is a fake/spam account. In the case where it may have already been deleted, you can replace the url with any other on the site and it will work again.
import scrapy
class DateSpider(scrapy.Spider):
    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        for container in response.xpath('//div[@class="user-details-wide"]'):
            yield {
                'Gender': response.xpath("//span[@id='gender']/text()").extract_first(),
                'Age': response.xpath("//span[@id='age']/text()").extract_first(),
                'State': response.xpath("//span[@id='state_id']/text()").extract_first(),
                'Marital status': response.xpath("//span[@id='maritalstatus']/text()").extract_first(),
                'Body': response.xpath("//span[@id='body']/text()").extract_first(),
                'Height': response.xpath("//span[@id='height']/text()").extract_first(),
                'Ethnicity': response.xpath("//span[@id='ethnicity']/text()").extract_first(),
                'Does drugs?': response.xpath("//span[@id='drugs']/text()").extract_first(),
                'Smokes?': response.xpath("//span[@id='smoke']/text()").extract_first(),
                'Drinks?': response.xpath("//span[@id='drink']/text()").extract_first(),
                'Has children?': response.xpath("//span[@id='haschildren']/text()").extract_first(),
                'Wants children?': response.xpath("//span[@id='wantchildren']/text()").extract_first(),
                'Star sign': response.xpath("//span[@id='zodiac']/text()").extract_first(),
                'Education': response.xpath("//span[@id='college_id']/text()").extract_first(),
                'Personality': response.xpath("//span[@id='fishtype']/text()").extract_first(),
            }
Running as follows:
scrapy crawl date -o date.csv
The output I'm looking for is one row of headers followed by one line of results straight after it, not the whitespace and duplicates I'm currently getting.
You don't need to use a for loop. Simply select each span element and extract the data from it.
Also, I suggest you use Scrapy items; it's more convenient.
One way to clean whitespace from the extracted data is to use the XPath function normalize-space().
import scrapy
from items import DateSpiderItem
class DateSpider(scrapy.Spider):
    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        item = DateSpiderItem()
        item['Gender'] = response.xpath(
            "//span[@id='gender']/text()").extract_first()
        item['Age'] = response.xpath(
            "//span[@id='age']/text()").extract_first()
        item['State'] = response.xpath(
            "//span[@id='state_id']/text()").extract_first()
        item['Marital_status'] = response.xpath(
            "normalize-space(//span[@id='maritalstatus']/text())").extract_first()
        item['Body'] = response.xpath(
            "//span[@id='body']/text()").extract_first()
        item['Height'] = response.xpath(
            "//span[@id='height']/text()").extract_first()
        item['Ethnicity'] = response.xpath(
            "//span[@id='ethnicity']/text()").extract_first()
        item['Does_drugs'] = response.xpath(
            "normalize-space(//span[@id='drugs']/text())").extract_first()
        item['Smokes'] = response.xpath(
            "//span[@id='smoke']/text()").extract_first()
        item['Drinks'] = response.xpath(
            "normalize-space(//span[@id='drink']/text())").extract_first()
        item['Has_children'] = response.xpath(
            "normalize-space(//span[@id='haschildren']/text())").extract_first()
        item['Wants_children'] = response.xpath(
            "normalize-space(//span[@id='wantchildren']/text())").extract_first()
        item['Star_sign'] = response.xpath(
            "//span[@id='zodiac']/text()").extract_first()
        yield item
Items file:
class DateSpiderItem(scrapy.Item):
    Gender = scrapy.Field()
    Age = scrapy.Field()
    State = scrapy.Field()
    Marital_status = scrapy.Field()
    Body = scrapy.Field()
    Height = scrapy.Field()
    Ethnicity = scrapy.Field()
    Does_drugs = scrapy.Field()
    Smokes = scrapy.Field()
    Drinks = scrapy.Field()
    Has_children = scrapy.Field()
    Wants_children = scrapy.Field()
    Star_sign = scrapy.Field()
    Education = scrapy.Field()
    Personality = scrapy.Field()
I need Scrapy to collect data from this tag and retrieve all three parts in one piece. The output would be something like:
Tonka double shock boys bike - $10 (Denver).
<span class="postingtitletext">Tonka double shock boys bike - <span class="price">$10</span><small> (Denver)</small></span>
The second thing is to collect data from the first span tag, so the result would be only:
2016 2004 Pontiac Grand Prix gt.
<p class="attrgroup"><span><b>2016 2004 Pontiac Grand Prix gt</b></span> <span>odometer: <b>164</b></span> <span>fuel : <b>gas</b></span> <span>transmission : <b>automatic</b></span> <span>title status : <b>clean</b></span></p>
Here is my code so far:
# -*- coding: utf-8 -*-
# scrapy crawl dmoz -o items.csv -t csv
import re
import scrapy
from scrapy.http import Request


# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://jxn.craigslist.org/search/cto?"
    ]
    BASE_URL = 'http://jxn.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = response.xpath("//p[@class='attrgroup']/span/b/text()").extract()
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
For posting title, get all the text nodes from the span tag and join them:
$ scrapy shell http://denver.craigslist.org/bik/5042090428.html
In [1]: "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
Out[1]: u'Tonka double shock boys bike - $10 (Denver)'
Note that the "Scrapy-way" to do this would be to use an ItemLoader and the Join() processor.
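A rough sketch of what that could look like (untested against the page; it reuses the DmozItem fields from the question, and note that in newer Scrapy releases these processors are imported from itemloaders.processors instead):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

class PostingLoader(ItemLoader):
    default_item_class = DmozItem
    default_output_processor = TakeFirst()
    title_out = Join()  # join all title text nodes into one string

def parse_attr(self, response):
    loader = PostingLoader(response=response)
    loader.add_xpath("title", "//span[@class='postingtitletext']//text()")
    loader.add_value("link", response.url)
    return loader.load_item()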
Second is to collect data from first span tag.
Since you haven't provided example input data, here is an educated guess:
response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0]