How to get data from subsequent URLs with Scrapy - Python

I want to fetch data from this website with Scrapy: http://www.go-on.fi/tyopaikat
I am able to fetch the data from this listing page, where the first-column titles are links (href) that lead to another page when clicked.
My question is: how can I make the spider go deeper, for example to this link, http://www.go-on.fi/tyopaikat/oulu/11414, and get information from there?
This is my code at the moment:
class JobDataSpider(CrawlSpider):
    name = "jobdata"
    allowed_domains = ["go-on.fi"]
    start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
                  "http://www.go-on.fi/tyopaikat?start=20",
                  "http://www.go-on.fi/tyopaikat?start=40",
                  "http://www.go-on.fi/tyopaikat?start=60"
                  ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        jobs = hxs.select("//tr")
        for row in jobs:
            item = JobData()
            item['title'] = row.select("./td[1]/a/text()").extract()
            item['link'] = row.select("./td[1]/a/@href").extract()
            item['location'] = row.select("./td[2]/text()").extract()

You need to yield a Request(url) for each of the URLs found on the first page. Something like this should work:
class JobDataSpider(CrawlSpider):
    name = "jobdata"
    allowed_domains = ["go-on.fi"]
    start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
                  "http://www.go-on.fi/tyopaikat?start=20",
                  "http://www.go-on.fi/tyopaikat?start=40",
                  "http://www.go-on.fi/tyopaikat?start=60"
                  ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.xpath("//tr/td[1]/a/@href").extract()
        for l in links:
            yield Request(l, callback=self.parse_pages)

    def parse_pages(self, response):
        hxs = HtmlXPathSelector(response)
        item = JobData()
        item['link'] = response.url
        # etc....
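For completeness, here is a minimal sketch of how parse_pages might fill in the remaining fields and yield the item. The detail-page markup is not shown in the thread, so the XPath expressions and the description field below are assumptions to adjust to the real page:

    def parse_pages(self, response):
        hxs = HtmlXPathSelector(response)
        item = JobData()
        item['link'] = response.url
        # Guessed selectors -- inspect a detail page such as
        # http://www.go-on.fi/tyopaikat/oulu/11414 and replace these
        # with the real title/location/description markup.
        item['title'] = hxs.select("//h1/text()").extract()
        item['location'] = hxs.select("//div[@class='location']/text()").extract()
        item['description'] = hxs.select("//div[@class='description']//text()").extract()  # hypothetical field
        yield item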

Related

Scrapy file, only running the initial start_urls instead of running through the whole list

As the title states, I am trying to run my Scrapy program; the issue I am running into is that it seems to be returning the yield only from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; in the code I have also left some commented-out lines showing what I have attempted.
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason when I run "scrapy crawl productJumperFix" I'm not getting any output from the terminal, and I'm not sure how to debug since I can't even see the output errors.
Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # build an absolute product URL
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name  # note: no trailing commas here, which would turn each value into a tuple
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
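Separately from start_requests, note that the original start_urls is missing commas after the second entry, so Python's implicit string-literal concatenation merges the remaining lines into one invalid URL; that alone would explain why only the first page ever yields anything. A quick illustration:

# Adjacent string literals with no comma between them are concatenated,
# so this list contains only TWO entries, not six:
start_urls = [
    'https://www.antaira.com/products/10-100Mbps',
    'https://www.antaira.com/products/unmanaged-gigabit'
    'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
    'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
    'https://www.antaira.com/products/Unmanaged-10-gigabit'
    'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
]
print(len(start_urls))  # 2 -- the second element is one long, invalid URL

As for the follow-up about seeing no terminal output, exporting to a feed (for example scrapy crawl productJumperFix -o items.json) makes it easy to confirm whether any items are actually being scraped.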

Scrapy: unable to scrape nested links properly - all the records are duplicated

I'm scraping a portal using Scrapy, and I need detailed metadata about every product.
To get this information, I first scrape all the products on a given page and then loop through each item to fetch its metadata. But all the details come out duplicated in the CSV file.
Here is the sample code:
class MyDinSpider(scrapy.Spider):
    name = 'abc'
    allowed_domains = ['www.abc.com.my']
    start_urls = ['http://https://www.abc/online-store/']

    def start_requests(self):
        with open("./csvFiles/urls.csv", "rU") as f:
            reader = csv.DictReader(f)
            for row in reader:
                url = row['url']
                link_urls = [url.format(i) for i in range(2)]
                for link_url in link_urls:
                    print(link_url)
                    request = Request(link_url, callback=self.parse_product_pages, meta={'Category': row['Category']})
                    yield request

    def parse_product_pages(self, response):
        item = MydinscrapingItem()
        content = response.css('.variants-pop-over').extract()
        for product_content in content:
            val = product_content.split('default value=')[1].split('>')[0]
            link = "https://abc/products/detail/?pid={}".format(val.split('"ProductExtId": "')[1].split('"')[0])
            item['Date'] = datetime.today().date()
            item['Title'] = val.split('"ProductName": "')[1].split('"')[0]
            item['Price'] = val.split('"price":"')[1].split('"')[0]
            yield Request(url=link, callback=self.parse_page2, meta={'item': item})

    def parse_page2(self, response):
        item = response.meta['item']
        item['Category'] = category[:-2]
        yield (item)

    def parse(self, response):
        pass
Here is the output csv file
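A likely cause of the duplication, judging from the code above: a single MydinscrapingItem is created before the loop and mutated on every pass, and that same instance is attached to each request's meta, so by the time the detail pages are parsed every record holds the last product's values. Below is a minimal sketch of the usual fix, creating a fresh item per product; since the category[:-2] line relies on a variable not defined in the snippet, the sketch pulls Category from meta instead, which is an assumption about the intent:

    def parse_product_pages(self, response):
        content = response.css('.variants-pop-over').extract()
        for product_content in content:
            val = product_content.split('default value=')[1].split('>')[0]
            link = "https://abc/products/detail/?pid={}".format(
                val.split('"ProductExtId": "')[1].split('"')[0])
            item = MydinscrapingItem()  # fresh item for every product
            item['Date'] = datetime.today().date()
            item['Title'] = val.split('"ProductName": "')[1].split('"')[0]
            item['Price'] = val.split('"price":"')[1].split('"')[0]
            yield Request(url=link, callback=self.parse_page2,
                          meta={'item': item, 'Category': response.meta.get('Category')})

    def parse_page2(self, response):
        item = response.meta['item']
        item['Category'] = response.meta.get('Category')
        yield item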

Scrapy Follow scraped links to get more Data

I want to follow the links that I've scraped to get more details.
For example, from here, which contains all the job titles,
I wish to go to one of the links, for example here, to extract the job description.
Below is my working code for getting the title, link and date, and writing them to a CSV file.
class MySpider(BaseSpider):
    name = "craigslist"
    allowed_domains = ["singapore.craigslist.com.sg"]
    start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]

    def parse(self, response):
        item = SampleItem()
        item["title"] = response.xpath('//*[@class="result-info"]/a/text()').extract()
        item["link"] = response.xpath('//*[@class="result-info"]/a/@href').extract()
        item["date"] = response.xpath('//*[@class="result-info"]/time[@class="result-date"]/text()').extract()
        for i in range(len(item["title"])):
            yield {"Title": item['title'][i], "Link": item['link'][i], "Date": item['date'][i]}
This is my attempt to go to the link but it hasn't been successful.
class MySpider(BaseSpider):
    name = "craigslist"
    allowed_domains = ["singapore.craigslist.com.sg"]
    start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]
    BASE_URL = 'https://singapore.craigslist.com.sg'

    def parse(self, response):
        links = response.xpath('//*[@class="result-info"]/a/@href').extract()
        item = SampleItem()
        item["title"] = response.xpath('//*[@class="result-info"]/a/text()').extract()
        item["date"] = response.xpath('//*[@class="result-info"]/time[@class="result-date"]/text()').extract()
        for i in range(len(item["title"])):
            yield {"Title": item['title'][i], "Date": item['date'][i]}
        for link in links:
            absolute_url = self.BASE_URL + link
            yield BaseSpider.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        item = SampleItem()
        item["description"] = response.xpath('//*[#id="postingbody"]/text()').extract()
        for i in range(len(item["description"])):
            yield {"Description": item["description"]}
Any idea how to do this?
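One way to make the second version work, sketched under the assumption that SampleItem and the page layout are as in the snippet: the request should be created with scrapy.Request (BaseSpider has no Request attribute), and the listing fields can travel along in meta so every detail page yields one combined record:

    # assumes "import scrapy" at the top of the file
    def parse(self, response):
        for result in response.xpath('//*[@class="result-info"]'):
            link = result.xpath('./a/@href').extract_first()
            yield scrapy.Request(
                response.urljoin(link),  # handles relative and absolute hrefs
                callback=self.parse_attr,
                meta={
                    'title': result.xpath('./a/text()').extract_first(),
                    'date': result.xpath('./time[@class="result-date"]/text()').extract_first(),
                })

    def parse_attr(self, response):
        yield {
            'Title': response.meta['title'],
            'Date': response.meta['date'],
            'Link': response.url,
            'Description': ''.join(response.xpath('//*[@id="postingbody"]/text()').extract()).strip(),
        }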

What's wrong with this Scrapy spider? It scrapes only the last URL

In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json


class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print "________________" + url
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print "____________" + url
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.

How to use Request function in a Scrapy Spider?

from string import join
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector
from Gfire.items import GfireItem


class GuideSpider(CrawlSpider):
    name = "Gfire"
    allowed_domains = ['www.example.com']
    start_urls = [
        "http://www.example.com/gfire/guides"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        sites = hxs.select('//div[@class="title"]')
        for site in sites:
            item = GFireItem()
            item['title'] = site.select('./a/text()').extract()
            item['guide_url'] = site.select('./a/@href').extract()
            item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
            items.append(item)
        return Request(items[1], callback=self.parse_item2)

    def parse_item2(self, response):
        hxs = HtmlXPathSelector(response)
        hero = hxs.select("//h3/a/text()").extract()
        return hero
I can't get this spider to work. The Request call takes items[1], which should be item['guide_url'], but Scrapy tells me the parameter has to be str or unicode.
How can I correct this error? And how can I pass the items list to the callback function? Via request.meta?
Your items[1] is actually an instance of GFireItem.
I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.
That aside, you need to extract the items[1]['guide_url'] url when creating the Request:
return Request(items[1]['guide_url'], callback=self.parse_item2)
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    sites = hxs.select('//div[@class="title"]')
    for site in sites:
        item = GFireItem()
        item['title'] = site.select('./a/text()').extract()
        item['guide_url'] = site.select('./a/@href').extract()
        item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
        items.append(item)
    return Request(items[1]['guide_url'], meta={'items': items}, callback=self.parse_item2)

def parse_item2(self, response):
    items = response.meta["items"]
    hxs = HtmlXPathSelector(response)
    hero = hxs.select("//h3/a/text()").extract()
    return hero
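If the intent was actually to follow every guide rather than only the second one, here is a sketch of that variant: one request per item, each carrying its own item in meta. The hero field is hypothetical and would need to exist on the item class:

def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    for site in hxs.select('//div[@class="title"]'):
        item = GFireItem()
        item['title'] = site.select('./a/text()').extract()
        item['guide_url'] = "http://www.example.com" + join(site.select('./a/@href').extract())
        yield Request(item['guide_url'], meta={'item': item}, callback=self.parse_item2)

def parse_item2(self, response):
    item = response.meta['item']
    hxs = HtmlXPathSelector(response)
    item['hero'] = hxs.select("//h3/a/text()").extract()  # hypothetical field on the item class
    return item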
