Why is my Scrapy spider duplicating its output? - python

I'm attempting to scrape a website to get a very rough demographic of its users (no personally identifying information or photos), but the tutorial spider from the official documentation, which I've modified, repeats the same line of output four times in a row.
A copy of the code I'm using is below:
Note that the example profile I've included in the code is a fake/spam account. If it has already been deleted, you can replace the URL with any other profile on the site and the code will work again.
import scrapy


class DateSpider(scrapy.Spider):
    name = "date"

    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        for container in response.xpath('//div[@class="user-details-wide"]'):
            yield {
                'Gender': response.xpath("//span[@id='gender']/text()").extract_first(),
                'Age': response.xpath("//span[@id='age']/text()").extract_first(),
                'State': response.xpath("//span[@id='state_id']/text()").extract_first(),
                'Marital status': response.xpath("//span[@id='maritalstatus']/text()").extract_first(),
                'Body': response.xpath("//span[@id='body']/text()").extract_first(),
                'Height': response.xpath("//span[@id='height']/text()").extract_first(),
                'Ethnicity': response.xpath("//span[@id='ethnicity']/text()").extract_first(),
                'Does drugs?': response.xpath("//span[@id='drugs']/text()").extract_first(),
                'Smokes?': response.xpath("//span[@id='smoke']/text()").extract_first(),
                'Drinks?': response.xpath("//span[@id='drink']/text()").extract_first(),
                'Has children?': response.xpath("//span[@id='haschildren']/text()").extract_first(),
                'Wants children?': response.xpath("//span[@id='wantchildren']/text()").extract_first(),
                'Star sign': response.xpath("//span[@id='zodiac']/text()").extract_first(),
                'Education': response.xpath("//span[@id='college_id']/text()").extract_first(),
                'Personality': response.xpath("//span[@id='fishtype']/text()").extract_first(),
            }
Running as follows:
scrapy crawl date -o date.csv
The output I'm looking for is one row of headers followed by a single row of results straight after it, not the whitespace and duplicates I'm currently getting.

You don't need the for loop here: the page contains several div elements with class user-details-wide, and because your XPaths are absolute (they start with //), each pass through the loop extracts the same page-level values again, which is why every line repeats. Simply select each span element once and extract the data from it.
Also, I suggest you use Scrapy Items; they're more convenient.
One way to clean whitespace out of the extracted data is the XPath function normalize-space().
import scrapy
from items import DateSpiderItem


class DateSpider(scrapy.Spider):
    name = "date"

    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        item = DateSpiderItem()
        item['Gender'] = response.xpath(
            "//span[@id='gender']/text()").extract_first()
        item['Age'] = response.xpath(
            "//span[@id='age']/text()").extract_first()
        item['State'] = response.xpath(
            "//span[@id='state_id']/text()").extract_first()
        item['Marital_status'] = response.xpath(
            "normalize-space(//span[@id='maritalstatus']/text())").extract_first()
        item['Body'] = response.xpath(
            "//span[@id='body']/text()").extract_first()
        item['Height'] = response.xpath(
            "//span[@id='height']/text()").extract_first()
        item['Ethnicity'] = response.xpath(
            "//span[@id='ethnicity']/text()").extract_first()
        item['Does_drugs'] = response.xpath(
            "normalize-space(//span[@id='drugs']/text())").extract_first()
        item['Smokes'] = response.xpath(
            "//span[@id='smoke']/text()").extract_first()
        item['Drinks'] = response.xpath(
            "normalize-space(//span[@id='drink']/text())").extract_first()
        item['Has_children'] = response.xpath(
            "normalize-space(//span[@id='haschildren']/text())").extract_first()
        item['Wants_children'] = response.xpath(
            "normalize-space(//span[@id='wantchildren']/text())").extract_first()
        item['Star_sign'] = response.xpath(
            "//span[@id='zodiac']/text()").extract_first()
        yield item
Items file:
import scrapy


class DateSpiderItem(scrapy.Item):
    Gender = scrapy.Field()
    Age = scrapy.Field()
    State = scrapy.Field()
    Marital_status = scrapy.Field()
    Body = scrapy.Field()
    Height = scrapy.Field()
    Ethnicity = scrapy.Field()
    Does_drugs = scrapy.Field()
    Smokes = scrapy.Field()
    Drinks = scrapy.Field()
    Has_children = scrapy.Field()
    Wants_children = scrapy.Field()
    Star_sign = scrapy.Field()
    Education = scrapy.Field()
    Personality = scrapy.Field()
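Run the spider the same way as before (with the corrected .csv extension):
scrapy crawl date -o date.csv
Since the spider now yields a single item per profile page, the feed exporter writes one header row followed by one row of values.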

Related

Unable to scrape Image URL (Scrapy)

I am trying to scrape data using Scrapy.
All of the parts data are extracted except the product image URL.
When I try to extract the image URL, it returns a list of empty strings instead.
Project Code
menscloths.py (Spider)
import scrapy
from ..items import DataItem


class MensclothsSpider(scrapy.Spider):
    name = 'menscloths'
    next_page = 2
    start_urls = ['https://www.example.com/clothing-and-accessories/topwear/pr?sid=clo%2Cash&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&page=1']

    def parse(self, response):
        items = DataItem()
        products = response.css("div._1xHGtK")
        for product in products:
            name = product.css(".IRpwTa::text").extract()
            brand = product.css("._2WkVRV::text").extract()
            original_price = product.css("._3I9_wc::text").extract()[1]
            sale_price = product.css("._30jeq3::text").extract()[0][1:]
            image_url = product.css("._2r_T1I::attr('src')").extract()
            product_page_url = "https://www.example.com" + product.css("._2UzuFa::attr('href')").extract()[0]
            product_category = "men topwear"
            items["name"] = name
            items["brand"] = brand
            items["original_price"] = original_price
            items["sale_price"] = sale_price
            items["image_url"] = image_url
            items["product_page_url"] = product_page_url
            items["product_category"] = product_category
            yield items
items.py
import scrapy


class DataItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    brand = scrapy.Field()
    original_price = scrapy.Field()
    sale_price = scrapy.Field()
    image_url = scrapy.Field()
    product_page_url = scrapy.Field()
    product_category = scrapy.Field()
settings.py
BOT_NAME = 'scraper'
SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'
ITEM_PIPELINES = {
    'scraper.pipelines.ScraperPipeline': 300,
}
Thanks in advance.
I've seen this happen multiple times before. If you watch closely while the page loads, you can see that the images appear only after a short delay (for me, about a second). Your code just loads the page and immediately tries to read the image URLs, before the images have been filled in by JavaScript. You need some sort of wait, so that the images have loaded before you extract them.
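Plain Scrapy never executes JavaScript, so one option is to render the page in a headless browser and wait for the images explicitly. Below is a minimal sketch using the scrapy-playwright plugin; assuming that plugin fits your setup, and reusing the img._2r_T1I selector from your spider (class names on sites like this change often):
import scrapy
from scrapy_playwright.page import PageMethod


class MensclothsSpider(scrapy.Spider):
    name = 'menscloths'

    # scrapy-playwright must be installed and enabled in settings.py:
    # DOWNLOAD_HANDLERS = {
    #     'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    #     'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    # }
    # TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

    def start_requests(self):
        yield scrapy.Request(
            'https://www.example.com/clothing-and-accessories/topwear/pr?page=1',
            meta={
                'playwright': True,
                # Don't hand the page to parse() until at least one
                # product image has a real src attribute.
                'playwright_page_methods': [
                    PageMethod('wait_for_selector', 'img._2r_T1I[src]'),
                ],
            },
        )

    def parse(self, response):
        for product in response.css('div._1xHGtK'):
            yield {'image_url': product.css('img._2r_T1I::attr(src)').get()}
Alternatively, lazy-loaded images often carry the real URL in another attribute (data-src or srcset, for example) even before any JavaScript runs; check the raw HTML, and if the URL is there, extract that attribute instead and skip the browser entirely.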

Python - How do I format scrapy data in a csv file?

I am new to Python and web scraping. I tried storing the Scrapy data in a CSV file, but the output is not what I want.
Current CSV output:
Title            Image
Audi,Benz,BMW    Image1,Image2,Image3
How I would like it to look in the CSV file:
Title    Image
Audi     Image1
Benz     Image2
BMW      Image3
This is what I type in the terminal to run it:
scrapy crawl testscraper -t csv -o test.csv
Here's the spider.py:
class TestSpiderSpider(scrapy.Spider):
    name = 'testscraper'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        items = scrapeItem()
        product_title = response.css('.jco-card-title::text').extract()
        product_imagelink = response.css('.card-image img::attr(data-src)').getall()
        items['product_title'] = product_title
        items['product_imagelink'] = product_imagelink
        items.append('items')
        yield items
Here's the code for items.py:
class scrapeItem(scrapy.Item):
    product_title = scrapy.Field()
    product_imagelink = scrapy.Field()
    pass
You can select every div element that contains a car and then iterate over those elements, yielding them one by one.
def parse(self, response):
    for car in response.css('.col.l3.s12.m6'):
        item = scrapeItem()
        product_title = car.css('.jco-card-title::text').get()
        product_imagelink = car.css('.card-image img::attr(data-src)').get()
        # Some of the elements don't contain a title or an image link (ads, for example).
        if product_title and product_imagelink:
            item['product_title'] = product_title.strip().replace('\n', '')
            item['product_imagelink'] = product_imagelink
            yield item
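Because the CSV feed exporter writes one row per yielded item, moving the yield inside the loop produces one Title/Image pair per row, which is exactly the layout you asked for.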

Create a static column item

I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (the pull field) and the state in which it was pulled (the state field). It's a self-loading page, so when I add the pull date, I only get the first 10 results (that is, only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I commented out (#) the lines that aren't working properly:
items.py file:
import scrapy


class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    #pull = scrapy.Field()
    #state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem


class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        #pull = time.strftime("%m/%d/%Y")
        #state = "AL"
        for item in zip(name, link, obit, news):  # removed 'pull, state'
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            #new_item['pull'] = pull
            #new_item["state"] = state
            yield new_item
Here's why: if you add pull and state to zip(name, link, obit, news), you get only two iterations, because state = "AL" is a string and zip() treats a string as a sequence of characters. zip() stops at its shortest argument, so the two-character string "AL" caps the whole loop at two iterations. The same goes for the date: "01/01/2001" is ten characters, so including pull would cap the loop at ten iterations, which is why you only saw the first page of results.
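A quick interpreter session showing that behaviour:
>>> names = ['Smith', 'Jones', 'Brown', 'Davis']
>>> list(zip(names, 'AL'))  # the string is iterated character by character
[('Smith', 'A'), ('Jones', 'L')]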
This will work:
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    pull = scrapy.Field()
    state = scrapy.Field()
class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        pull = time.strftime("%m/%d/%Y")
        state = "AL"
        for item in zip(name, link, obit, news):  # pull and state stay out of zip()
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            new_item['pull'] = pull
            new_item["state"] = state
            yield new_item

NameError: name 'DmozItem' is not defined

Still getting to grips with Scrapy and been following this tutorial. Having a little trouble, however, as I am getting NameError: name 'DmozItem' is not defined when I run this:
import scrapy
from scrapy import Item, Field


class QuotesItems(scrapy.Item):
    area_name = scrapy.Field()
    room_type = scrapy.Field()
    period = scrapy.Field()
    duration_weekly = scrapy.Field()
    guide_total = scrapy.Field()
    amenities = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "not_quotes"
    start_urls = [
        'http://www.unitestudents.com/',
    ]

    # Step 1
    def parse(self, response):
        # Select all cities listed in the select (exclude the "Select your city" option)
        for city in response.xpath('//select[@id="frm_homeSelect_city"]/option[not(contains(text(),"Select your city"))]/text()').extract():
            yield scrapy.Request(response.urljoin("/" + city), callback=self.parse_citypage)

    # Step 2
    def parse_citypage(self, response):
        # Select the url for each property
        for url in response.xpath('//div[@class="property-header"]/h3/span/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_unitpage)

    # Step 3
    def parse_unitpage(self, response):
        # Select the final page for the data scrape
        for final in response.xpath('//div/div/div[@class="content__btn"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(final), callback=self.parse_final)

    # Step 4
    def parse(self, response):
        for sel in response.xpath('//html/body/div'):
            item = DmozItem()
            item['area_name'] = sel.xpath('//div/ul/li/a/span/text()').extract()
            item['room_type'] = sel.xpath('//div/div/div/h1/span/text()').extract()
            item['period'] = sel.xpath('/html/body/div/div/section/div/form/h4/span/text()').extract()
            item['duration_weekly'] = sel.xpath('//html/body/div/div/section/div/form/div/div/em/text()').extract()
            item['guide_total'] = sel.xpath('//html/body/div/div/section/div/form/div/div/p/text()').extract()
            item['amenities'] = sel.xpath('//div/div/div/ul/li/p/text()').extract()
            yield item
I have set up my items.py file as:
class DmozItem(Item):
area_name = Field()
room_type = Field()
period = Field()
duration_weekly = Field()
guide_total = Field()
amenities = Field()
pass
Not really sure where I am going wrong on this one?
You should import DmozItem in the spider module:
from YourFolderName.items import DmozItem
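As a minimal sketch, assuming your Scrapy project package is called tutorial (substitute the folder that actually contains items.py), the top of the spider file would look like:
import scrapy
# 'tutorial' is a placeholder: use the package that contains your items.py
from tutorial.items import DmozItem


class QuotesSpider(scrapy.Spider):
    name = "not_quotes"
    start_urls = ['http://www.unitestudents.com/']

    def parse(self, response):
        item = DmozItem()  # resolves now that the class is imported
        item['area_name'] = response.xpath('//div/ul/li/a/span/text()').extract()
        yield item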

Scrapy collect data from first element and post's title

I need Scrapy to collect data from this tag and retrieve all three parts in one piece. The output would be something like:
Tonka double shock boys bike - $10 (Denver).
<span class="postingtitletext">Tonka double shock boys bike - <span class="price">$10</span><small> (Denver)</small></span>
The second task is to collect data from the first span tag, so the result would be only:
2016 2004 Pontiac Grand Prix gt.
<p class="attrgroup"><span><b>2016 2004 Pontiac Grand Prix gt</b></span> <span>odometer: <b>164</b></span> <span>fuel : <b>gas</b></span> <span>transmission : <b>automatic</b></span> <span>title status : <b>clean</b></span></p>
Here is my code so far:
# -*- coding: utf-8 -*-
# scrapy crawl dmoz -o items.csv -t csv
import re
import scrapy
from scrapy.http import Request
# item class included here
class DmozItem(scrapy.Item):
# define the fields for your item here like:
link = scrapy.Field()
attr = scrapy.Field()
title = scrapy.Field()
tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["craigslist.org"]
start_urls = [
"http://jxn.craigslist.org/search/cto?"
]
BASE_URL = 'http://jxn.craigslist.org/'
def parse(self, response):
links = response.xpath('//a[#class="hdrlnk"]/#href').extract()
for link in links:
absolute_url = self.BASE_URL + link
yield scrapy.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
match = re.search(r"(\w+)\.html", response.url)
if match:
item_id = match.group(1)
url = self.BASE_URL + "reply/nos/vgm/" + item_id
item = DmozItem()
item["link"] = response.url
item["title"] = "".join(response.xpath("//span[#class='postingtitletext']//text()").extract())
item["tag"]=response.xpath("//p[#class='attrgroup']/span/b/text()").extract()
return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)
def parse_contact(self, response):
item = response.meta['item']
item["attr"] = "".join(response.xpath("//div[#class='anonemail']//text()").extract())
return item
For the posting title, get all the text nodes from the span tag and join them:
$ scrapy shell http://denver.craigslist.org/bik/5042090428.html
In [1]: "".join(response.xpath("//span[#class='postingtitletext']//text()").extract())
Out[1]: u'Tonka double shock boys bike - $10 (Denver)'
Note that the "Scrapy-way" to do this would be to use an ItemLoader and the Join() processor.
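A minimal sketch of that approach, assuming a recent Scrapy where the built-in processors live in itemloaders.processors (older releases expose them as scrapy.loader.processors):
from scrapy.loader import ItemLoader
from itemloaders.processors import Identity, Join, TakeFirst


class DmozLoader(ItemLoader):
    default_output_processor = TakeFirst()
    title_out = Join()    # joins every title text node into a single string
    tag_out = Identity()  # keeps the full list of attribute values


def parse_attr(self, response):
    # Replaces the manual item assembly above; the follow-up request
    # for the email address is omitted from this sketch.
    loader = DmozLoader(item=DmozItem(), response=response)
    loader.add_value('link', response.url)
    loader.add_xpath('title', "//span[@class='postingtitletext']//text()")
    loader.add_xpath('tag', "//p[@class='attrgroup']/span/b/text()")
    return loader.load_item()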
As for the second task, collecting data from the first span tag, here is an educated guess based on the markup above:
response.xpath("//p[#class='attrgroup']/span/b/text()").extract()[0]
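As an aside, extract_first() (or .get() in newer Scrapy versions) is safer than .extract()[0], because it returns None instead of raising an IndexError when nothing matches.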
