Scrapy Follow scraped links to get more Data

Scrapy Follow scraped links to get more Data - python

I want to follow the links that I've scraped to get more details.
For example, from here, which contains all the job titles.
I wish to go to one of the links, for example, here to extract the job descriptions.
Below is my working code for getting the Title Link and Date as well as getting them to insert into a CSV File.
class MySpider(BaseSpider):
name = "craigslist"
allowed_domains = ["singapore.craigslist.com.sg"]
start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]
def parse(self, response):
item = SampleItem()
item["title"] = response.xpath('//*[#class="result-info"]/a/text()').extract()
item["link"] = response.xpath('//*[#class="result-info"]/a/#href').extract()
item["date"] = response.xpath('//*[#class="result-info"]/time[#class="result-date"]/text()').extract()
for i in range(len(item["title"])):
yield {"Title": item['title'][i], "Link": item['link'][i], "Date": item['date'][i]}
This is my attempt to go to the link but it hasn't been successful.
class MySpider(BaseSpider):
name = "craigslist"
allowed_domains = ["singapore.craigslist.com.sg"]
start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]
BASE_URL = 'https://singapore.craigslist.com.sg'
def parse(self, response):
links = response.xpath('//*[#class="result-info"]/a/#href').extract()
item = SampleItem()
item["title"] = response.xpath('//*[#class="result-info"]/a/text()').extract()
item["date"] = response.xpath('//*[#class="result-info"]/time[#class="result-date"]/text()').extract()
for i in range(len(item["title"])):
yield {"Title": item['title'][i], "Date": item['date'][i]}
for link in links:
absolute_url = self.BASE_URL + link
yield BaseSpider.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
item = SampleItem()
item["description"] = response.xpath('//*[#id="postingbody"]/text()').extract()
for i in range(len(item["description"])):
yield {"Description" : item["description"]}
Any idea how to do this?
Log of scraper

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my scrapy program, the issue I am running into is that it seems to be only returning the yield from the initial url (https://www.antaira.com/products/10-100Mbps).
I am unsure on where my program is not working, in my code I have also left some commented code on what I have attempted.
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider): # classes should be TitleCase
name = 'productJumperFix'
allowed_domains = ['antaira.com']
start_urls = [
'https://www.antaira.com/products/10-100Mbps',
'https://www.antaira.com/products/unmanaged-gigabit'
'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
'https://www.antaira.com/products/Unmanaged-10-gigabit'
'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
]
#def start_requests(self):
# yield scrappy.Request(start_urls, self.parse)
def parse(self, response):
# iterate through each of the relative urls
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem() # Unique item for each iteration
items['product_link'] = response.url # get the product link from response
name = product.css('h1.product-name::text').get().strip()
features = product.css(('section.features h3 + ul').strip()).getall()
overview = product.css('.products .product-overview::text').getall()
main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items
Thank you everyone!
Follow up question, for some reason when I run "scrapy crawl productJumperFix" im not getting any output from the terminal,not sure how to debug since I can't even see the output errors.

Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):
name = 'productJumperFix'
allowed_domains = ['antaira.com']
def start_requests(self):
urls = [
'https://www.antaira.com/products/10-100Mbps',
'https://www.antaira.com/products/unmanaged-gigabit',
'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
'https://www.antaira.com/products/Unmanaged-10-gigabit',
'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
]
for url in urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem()
items['product_link'] = response.url
name = product.css('h1.product-name::text').get().strip()
features = product.css(('section.features h3 + ul').strip()).getall()
overview = product.css('.products .product-overview::text').getall()
main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items

Scrapy : unable to scrape nested links properly - all the records are duplicating

I'm scraping the portal using Scrapy, Here, I need to get detailed meta data about every product.
So, to get this information - initially I have scraped all the products in given page and looping through each item to get meta data. But All the details are duplicated in the CSV file.
Here is the sample code:
class MyDinSpider(scrapy.Spider):
name = 'abc'
allowed_domains = ['www.abc.com.my']
start_urls = ['http://https://www.abc/online-store/']
def start_requests(self):
with open("./csvFiles/urls.csv", "rU") as f:
reader=csv.DictReader(f)
for row in reader:
url=row['url']
link_urls = [url.format(i) for i in range(2)]
for link_url in link_urls:
print(link_url)
request=Request(link_url, callback=self.parse_product_pages, meta={'Category': row['Category']})
yield request
def parse_product_pages(self,response):
item=MydinscrapingItem()
content=response.css('.variants-pop-over').extract()
for product_content in content:
val = product_content.split('default value=')[1].split('>')[0]
link = "https://abc/products/detail/?pid={}".format(val.split('"ProductExtId": "')[1].split('"')[0])
item['Date'] = datetime.today().date()
item['Title'] = val.split('"ProductName": "')[1].split('"')[0]
item['Price'] = val.split('"price":"')[1].split('"')[0]
yield Request(url=link, callback=self.parse_page2, meta={'item': item})
def parse_page2(self, response):
item = response.meta['item']
item['Category'] = category[:-2]
yield (item)
def parse(self, response):
pass
Here is the output csv file

How to pass arguments between two spiders with scrapy callback

I have two scrapy that the first one crawl a sitemap and extract urls and put it in a txt file and the second one reads it and crawl this urls line by line.
my code like bellow :
class sitemapSpider(SitemapSpider):
name = "filmnetmapSpider"
sitemap_urls = ['http://filmnet.ir/sitemap.xml']
sitemap_rules = [
('/series/', 'parse_item')
]
storage_file = 'urls.txt'
def parse_item(self, response):
videoid = response.url
with open(self.storage_file, 'a') as handle:
yield handle.writelines(videoid + '\n')
second spider :
class filmnetSpider(scrapy.Spider):
name = 'filmnetSpider'
def start_requests(self):
with open('urls.txt') as fp:
for line in fp:
yield Request(line.strip(), callback=self.parse_website)
def parse_website(self, response):
hxs = HtmlXPathSelector(response)
url = hxs.xpath('//script[#type="application/ld+json"]/text()').extract()
url = ast.literal_eval(json.dumps(url))
url = url[1]
obj = json.loads(url)
poster = obj['image']
name = obj['name']
description = obj['description']
How to change the code to delete read/write to the file?
How to use callback in it?
Note : This code does not work in one scrapy spider ;code is :Two given scrapy + bellow code ,As an example is said in doc
process = CrawlerProcess()
process.crawl(filmnetSpider)
process.crawl(sitemapSpider)
process.start()

This should work:
class sitemapSpider(SitemapSpider):
name = "filmnetmapSpider"
sitemap_urls = ['http://filmnet.ir/sitemap.xml']
sitemap_rules = [
('/series/', 'parse_item')
]
def parse_item(self, response):
videoid = response.url
yield Request(videoid, callback=self.parse_website)
def parse_website(self, response):
hxs = HtmlXPathSelector(response)
url = hxs.xpath('//script[#type="application/ld+json"]/text()').extract()
url = ast.literal_eval(json.dumps(url))
url = url[1]
obj = json.loads(url)
poster = obj['image']
name = obj['name']
description = obj['description']

whats wrong with this scrapy spider? scrapes only last url

In method parse() spider crawls 4 urls and then sends to method parse_dir_contents() to scrape some data but only 4th url is being scraped I don't understand why it is not scraping other 3 urls?
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
name = "linkedin"
allowed_domains = ["linkedin.com"]
start_urls = [
"https://in.linkedin.com/directory/people-s-1-2-4/",
]
def parse(self, response):
for href in response.xpath('//*[#id="seo-dir"]/div/div/div/ul/li/a/#href'):
url = response.urljoin(href.extract())
print "________________"+url
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//*[#id="profile"]'):
url = response.url
print "____________"+url
item = VOneItem()
item['name'] = sel.xpath('//*[#id="name"]/text()').extract()
item['headline'] = sel.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = url
yield item

By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
item = VOneItem()
item['name'] = response.xpath('//*[#id="name"]/text()').extract()
item['headline'] = response.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = response.url
return item
And check if this solves your issue.

How to get data from subsequent urls with scrappy

I want to fetch data from this website: http://www.go-on.fi/tyopaikat with Scrappy.
I am able to fetch the data from this page which have href move to another page when you click on the first column title.
My question is how can I lead the spider to go deeper to for example this link: http://www.go-on.fi/tyopaikat/oulu/11414 and get information from there.
so this is my code at this moment:
class JobDataSpider(CrawlSpider):
name = "jobdata"
allowed_domains = ["go-on.fi"]
start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
"http://www.go-on.fi/tyopaikat?start=20",
"http://www.go-on.fi/tyopaikat?start=40",
"http://www.go-on.fi/tyopaikat?start=60"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
jobs = hxs.select("//tr")
for row in jobs:
item = JobData()
item['title'] = row.select("./td[1]/a/text()").extract()
item['link'] = row.select("./td[1]/a/#href").extract()
item['location'] = row.select("./td[2]/text()").extract()

You need to yield Request(url) for each of the URl's found on the first page. Something like this should work:
class JobDataSpider(CrawlSpider):
name = "jobdata"
allowed_domains = ["go-on.fi"]
start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
"http://www.go-on.fi/tyopaikat?start=20",
"http://www.go-on.fi/tyopaikat?start=40",
"http://www.go-on.fi/tyopaikat?start=60"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.xpath("//tr/td[1]/a/#href").extract()
for l in links:
yield Request(l, callback=self.parse_pages)
def parse_pages(self, response):
hxs = HtmlXPathSelector(response)
item = JobData()
item['link'] = response.url
etc....

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy Follow scraped links to get more Data - python

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

Scrapy : unable to scrape nested links properly - all the records are duplicating

How to pass arguments between two spiders with scrapy callback

whats wrong with this scrapy spider? scrapes only last url

How to get data from subsequent urls with scrappy

Categories

Resources