I am trying to scrape data using Scrapy.
All Parts data are extracted except the Product Image URL.
When trying to extract the Image URL It returns a List of Empty Strings as Shown in the below Image
Project Code
menscloths.py (Spider)
import scrapy
from ..items import DataItem
class MensclothsSpider(scrapy.Spider):
name = 'menscloths'
next_page=2
start_urls = ['https://www.example.com/clothing-and-accessories/topwear/pr?sid=clo%2Cash&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&page=1']
def parse(self, response):
items=DataItem()
products=response.css("div._1xHGtK")
for product in products:
name = product.css(".IRpwTa::text").extract()
brand = product.css("._2WkVRV::text").extract()
original_price = product.css("._3I9_wc::text").extract()[1]
sale_price = product.css("._30jeq3::text").extract()[0][1:]
image_url = product.css("._2r_T1I::attr('src')").extract()
product_page_url = "https://www.example.com"+product.css("._2UzuFa::attr('href')").extract()[0]
product_category = "men topwear"
items["name"]=name
items["brand"]=brand
items["original_price"]=original_price
items["sale_price"]=sale_price
items["image_url"]=image_url
items["product_page_url"]=product_page_url
items["product_category"]=product_category
yield items
item.py
import scrapy
class DataItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
brand = scrapy.Field()
original_price = scrapy.Field()
sale_price = scrapy.Field()
image_url = scrapy.Field()
product_page_url = scrapy.Field()
product_category = scrapy.Field()
setting.py
BOT_NAME = 'scraper'
SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'
ITEM_PIPELINES = {
'scraper.pipelines.ScraperPipeline': 300,
}
Thank in advance
I've seen this happen multiple times before. If you look closely at the images when you load the page, you can see that the image appears after a bit of time (even though, at least for me, the time it takes to load is about 1 second). However, your code is just loading the page and then trying to get the images, not waiting for the images to load in. You need some sort of wait function in order to wait for the images to load, and then get the images.
Related
I am new to python and web scraping and I tried storing the scrapy data to a csv file however the output is not satisfactory.
Current csv output:
Title Image
Audi,Benz,BMW Image1,Image2,Image3
how i would like to view it in a csv file:
Title Image
Audi Image1
Benz Image2
BMW Image3
this is what is type in the terminal to run it:
scrapy crawl testscraper -t csv -o test.csv
Here's the spider.py:
class TestSpiderSpider(scrapy.Spider):
name = 'testscraper'
page_number = 2
start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']
def parse(self, response):
items = scrapeItem()
product_title = response.css('.jco-card-title::text').extract()
product_imagelink = response.css('.card-image img::attr(data-src)').getall()
items['product_title'] = product_title
items['product_imagelink'] = product_imagelink
items.append('items')
yield items
He's the code for items.py:
class scrapeItem(scrapy.Item):
product_title = scrapy.Field()
product_imagelink = scrapy.Field()
pass
You can select every div element that contains a car and then iterate over those elements, yielding them one by one.
def parse(self, response):
for car in response.css('.col.l3.s12.m6'):
item = scrapeItem()
product_title = car.css('.jco-card-title::text').get()
product_imagelink = car.css('.card-image img::attr(data-src)').get()
# Some of the elements don't contain a title or a image_link, like ads for example.
if product_title and product_imagelink:
item['product_title'] = product_title.strip().replace('\n', '')
item['product_imagelink'] = product_imagelink
yield item
I would like to scrape data points highlighted in the picture (hotel names, location, reviews, ratings, and prices) but my spider is not returning anything (most likely due to wrong selectors). The URL to the website is here:
https://www.expedia.com/Hotel-Search?destination=Vienna®ionId=178316&startDate=2020-09-25&endDate=2020-09-26&d1=2020-09-25&d2=2020-09-26&rooms=1&adults=2
Here is my spider code:
class ExpediaSpider(scrapy.Spider):
name = 'expedia'
# allowed_domains = ['expedia.com']
start_urls = [all_urls[0]]
def parse(self, response):
items = ExpediaScraperItem()
html = response.css('.uitk-card-link')
for qoutes in html:
review = qoutes.css('div.listing__reviews all-t-margin-two').css('::text').extract()
price = qoutes.css('span.uitk-cell loyalty-display-price all-cell-shrink').css('::text').extract()
hotel_name = qoutes.css('truncate-lines-2 all-b-padding-half pwa-theme--grey-900 uitk-type-heading-500').css('::text').extract()
location = qoutes.css('overflow-wrap uitk-spacing uitk-spacing-padding-blockend-two uitk-text-secondary-theme').css('::text').extract()
# then save it
items['review'] = review # eqauls to var extracted
items['price'] = price
items['hotel_name'] = hotel_name
items['location'] = location
yield items
I also tried directly listing the selectors without a loop but I am striking out. If anyone has some time and could explain to me some CSS/XPath tricks to this HTML "blob" that would be awesome. Thank you for taking the time to read this.
This will work:
hotels = response.xpath('//li[#data-stid="property-listing"]')
for hotel in hotels:
review = hotel.xpath('string(.//div[#data-stid="content-hotel-review-info"]/span/span[1])').get()
price = hotel.xpath('.//span[#data-stid="price-lockup-text"]/text()').get()
hotel_name = hotel.xpath('.//h3[#data-stid="content-hotel-title"]/text()').get()
location = hotel.xpath('.//div[#data-test-id="content-hotel-neighborhood"]/text()').get()
# then save it
items['review'] = review # eqauls to var extracted
items['price'] = price
items['hotel_name'] = hotel_name
items['location'] = location
yield items
I am using Python Scrapy to scrape a website I am sorry if this is a noob question because I am new in this I know we could split the data with .split() but I dont know where to use it.
Here is my spider
import scrapy
from scrapy.loader import ItemLoader
from ..items import FreelancerItem
class MatchSpider(scrapy.Spider):
name = 'match'
start_urls = ['http://www.totalcorner.com/match/schedule/20191215/']
def parse(self, response):
items = FreelancerItem()
all_table_cols = response.xpath("//tbody[#class='tbody_match']")
for col in all_table_cols:
League= col.css(".td_league a::text").extract()
Time= col.css(".match_status_minutes::text").extract()
Match_status = col.xpath("//span[#class='match_status_minutes']/text").extract()
Yellow_home = col.css(".yellow_card::text").extract()
Red_home = col.css(".match_home .red_card::text").extract()
Home = col.css(".match_home a span::text").extract()
Score=col.css(".match_goal::text").extract()
Away = col.css(".match_away a span::text").extract()
Yellow_away = col.css(".match_away .yellow_card::text").extract()
Red_away = col.css(".match_away .red_card::text").extract()
Handicap = col.css(".match_handicap::text").extract()
Corner= col.css(".match_corner::text").extract()
Goal_Line=col.css(".total_goals div::text").extract()
Tips=col.css(".newlabel::text").extract()
Dangerous_attack= col.css(".match_attach div::text").extract()
Shots=col.css(".match_shoot div::text").extract()
items["League"] =League
items["Time"] = Time
items["Match_status"] = Match_status
items["Yellow_home"] = Yellow_home
items["Red_home"] = Red_home
items["Home"] = Home
items["Score"] = Score
items["Away"] = Away
items["Yellow_away"] = Yellow_away
items["Red_away"] = Red_away
items["Handicap"] = Handicap
items["Corner"] = Corner
items["Goal_Line"] = Goal_Line
items["Tips"] = Tips
items["Dangerous_attack"] = Dangerous_attack
items["Shots"] = Shots
yield items
And this is my items.py
import scrapy
class FreelancerItem(scrapy.Item):
League=scrapy.Field()
Time = scrapy.Field()
Match_status = scrapy.Field()
Yellow_home = scrapy.Field()
Red_home = scrapy.Field()
Home = scrapy.Field()
Score = scrapy.Field()
Away = scrapy.Field()
Yellow_away = scrapy.Field()
Red_away = scrapy.Field()
Handicap = scrapy.Field()
Corner = scrapy.Field()
Goal_Line = scrapy.Field()
Tips = scrapy.Field()
Dangerous_attack = scrapy.Field()
Shots = scrapy.Field()
Also I want to arrange it according to the titles but in the csv output it arranges titles alphabetically.
Please help me learn
Thanks.
I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (pull item) and the state in which it was pulled (state item). It's a self loading page so when I add the pull date, I only get the first 10 results (or only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I did # lines that aren't working properly:
Item.py file:
import scrapy
class AlItem(scrapy.Item):
name = scrapy.Field()
link = scrapy.Field()
obit = scrapy.Field()
news = scrapy.Field()
#pull = scrapy.Field()
#state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem
class AlabamaSpider(scrapy.Spider):
name = 'alabama'
allowed_domains = ['legacy.com']
start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']
def parse(self, response):
name = response.xpath('//a[#class="NonMobile"]/p[#class="obitName"]/text()').extract()
link = response.xpath('//div[#class="RightColumn"]//a[#class="ObituaryButton"]/#href').extract()
obit = response.xpath('//div[#class="NameAndLocation"]/p[#class="obitText"]/text()').extract()
news = response.xpath('//div[#class="PublishedLine publishedLine"]/span/text()').extract()
#pull = time.strftime("%m/%d/%Y")
#state = "AL"
for item in zip(name, link, obit, news): #removed 'pull, state'
new_item = AlItem()
new_item['name'] = item[0]
new_item['link'] = item[1]
new_item['obit'] = item[2]
new_item['news'] = item[3]
#new_item['pull'] = pull
#new_item["state"] = state
yield new_item
I explain why:
if you paste in here for item in zip(name, link, obit, news): pull & state, then you will get the number of iterations equal 2 because state = "AL" - string variable. ZIP function get from state two chars and set iteration = 2 for all arguments in loop. zip gets the smallest numb from arguments for iteration. as with the date 01/01/2001 - 10 chars. (will iterations equal 10)
WILL WORKING:
`class AlItem(scrapy.Item):
name = scrapy.Field()
link = scrapy.Field()
obit = scrapy.Field()
news = scrapy.Field()
pull = scrapy.Field()
state = scrapy.Field()`
class AlabamaSpider(scrapy.Spider):
name = 'alabama'
allowed_domains = ['legacy.com']
start_urls = ['http://www.legacy.com/obituaries/annistonstar/browsetype=paid&page=20']
def parse(self, response):
name = response.xpath('//a[#class="NonMobile"]/p[#class="obitName"]/text()').extract()
link = response.xpath('//div[#class="RightColumn"]//a[#class="ObituaryButton"]/#href').extract()
obit = response.xpath('//div[#class="NameAndLocation"]/p[#class="obitText"]/text()').extract()
news = response.xpath('//div[#class="PublishedLine publishedLine"]/span/text()').extract()
pull = time.strftime("%m/%d/%Y")
state = "AL"
for item in zip(name, link, obit, news): #removed 'pull, state'
new_item = AlItem()
new_item['name'] = item[0]
new_item['link'] = item[1]
new_item['obit'] = item[2]
new_item['news'] = item[3]
new_item['pull'] = pull
new_item["state"] = state
yield new_item
I'm attempting to scrape a website to get a very rough demographic of it's users (no personally identifying information or photos), but the tutorial spider from the official documentation I've modified is repeating the same line of output 4 times in a row.
A copy of the code I'm using is below:
Note that the example profile I've included in the code is a fake/spam account. In the case where it may have already been deleted, you can replace the url with any other on the site and it will work again.
import scrapy
class DateSpider(scrapy.Spider):
name = "date"
start_urls = [
'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
]
def parse(self, response):
for container in response.xpath('//div[#class="user-details-wide"]'):
yield {
'Gender': response.xpath("//span[#id='gender']/text()").extract_first(),
'Age': response.xpath("//span[#id='age']/text()").extract_first(),
'State': response.xpath("//span[#id='state_id']/text()").extract_first(),
'Marital status': response.xpath("//span[#id='maritalstatus']/text()").extract_first(),
'Body': response.xpath("//span[#id='body']/text()").extract_first(),
'Height': response.xpath("//span[#id='height']/text()").extract_first(),
'Ethnicity': response.xpath("//span[#id='ethnicity']/text()").extract_first(),
'Does drugs?': response.xpath("//span[#id='drugs']/text()").extract_first(),
'Smokes?': response.xpath("//span[#id='smoke']/text()").extract_first(),
'Drinks?': response.xpath("//span[#id='drink']/text()").extract_first(),
'Has children?': response.xpath("//span[#id='haschildren']/text()").extract_first(),
'Wants children?': response.xpath("//span[#id='wantchildren']/text()").extract_first(),
'Star sign': response.xpath("//span[#id='zodiac']/text()").extract_first(),
'Education': response.xpath("//span[#id='college_id']/text()").extract_first(),
'Personality': response.xpath("//span[#id='fishtype']/text()").extract_first(),
}
Running as follows:
scrapy crawl date -o date.scv
The output I'm looking for is one row of headers followed by one line of results straight after it, not the whitespace and duplicates I'm currently getting.
You don't need to use for loop. Simply find a span element and extract all data from him.
Also, I suggest you use scrapy items it's more convenient.
One way to clean extracted data from whitespace is to use xpath function normalize-space().
import scrapy
from items import DateSpiderItem
class DateSpider(scrapy.Spider):
name = "date"
start_urls = [
'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
]
def parse(self, response):
item = DateSpiderItem()
item['Gender'] = response.xpath(
"//span[#id='gender']/text()").extract_first()
item['Age'] = response.xpath(
"//span[#id='age']/text()").extract_first()
item['State'] = response.xpath(
"//span[#id='state_id']/text()").extract_first()
item['Marital_status'] = response.xpath(
"normalize-space(//span[#id='maritalstatus']/text())").extract_first()
item['Body'] = response.xpath(
"//span[#id='body']/text()").extract_first()
item['Height'] = response.xpath(
"//span[#id='height']/text()").extract_first()
item['Ethnicity'] = response.xpath(
"//span[#id='ethnicity']/text()").extract_first()
item['Does_drugs'] = response.xpath(
"normalize-space(//span[#id='drugs']/text())").extract_first()
item['Smokes'] = response.xpath(
"//span[#id='smoke']/text()").extract_first()
item['Drinks'] = response.xpath(
"normalize-space(//span[#id='drink']/text())").extract_first()
item['Has_children'] = response.xpath(
"normalize-space(//span[#id='haschildren']/text())").extract_first()
item['Wants_children'] = response.xpath(
"normalize-space(//span[#id='wantchildren']/text())").extract_first()
item['Star_sign'] = response.xpath(
"//span[#id='zodiac']/text()").extract_first()
yield item
Items file:
class DateSpiderItem(scrapy.Item):
Gender = scrapy.Field()
Age = scrapy.Field()
State = scrapy.Field()
Marital_status = scrapy.Field()
Body = scrapy.Field()
Height = scrapy.Field()
Ethnicity = scrapy.Field()
Does_drugs = scrapy.Field()
Smokes = scrapy.Field()
Drinks = scrapy.Field()
Has_children = scrapy.Field()
Wants_children = scrapy.Field()
Star_sign = scrapy.Field()
Education = scrapy.Field()
Personality = scrapy.Field()
Output: