I am trying to crawl some selected domains and take only the essential pages from those websites. My approach is to crawl one webpage of the domain and take a limited set of URLs; these URLs are then crawled to look for reoccurring URLs that I found on the first webpage. This way I try to eliminate all the URLs that didn't reoccur (content URLs, such as products etc.). The reason I am asking for help is that scrapy.Request is not being executed more than once.
This is what i have so far:
class Finder(scrapy.Spider):
    """Spider that looks for URLs recurring across pages of one domain.

    For the first page seen on each domain it collects the same-host links,
    then builds requests for up to ``maximumReoccurringPages`` of them whose
    callback intersects that link list with the links on the requested page.
    """

    name = "finder"
    start_urls = ['http://www.nu.nl/']
    # Hostnames that were already processed, so each domain is handled once.
    uniqueDomainUrl = dict()
    # Upper bound on the number of linked pages requested per domain.
    maximumReoccurringPages = 5
    # NOTE(review): Scrapy documents ``rules`` as a CrawlSpider feature; on a
    # plain scrapy.Spider they look inert -- confirm the intended base class.
    rules = (
        Rule(
            LinkExtractor(
                allow=('.nl', '.nu', '.info', '.net', '.com', '.org'),
                deny=('facebook','amazon', 'wordpress', 'blogspot', 'free', 'reddit',
                      'videos', 'youtube', 'google', 'doubleclick', 'microsoft', 'yahoo',
                      'bing', 'znet', 'stackexchang', 'twitter', 'wikipedia', 'creativecommons',
                      'mediawiki', 'wikidata'),
            ),
            process_request='parse',
            follow=True
        ),
    )

    def parse(self, response):
        """Process a downloaded page, handling every domain only once."""
        self.logger.info('Entering URL: %s', response.url)
        currentDomain = urlparse.urlparse(response.url).hostname
        if currentDomain in self.uniqueDomainUrl:
            # Stop here: a bare ``yield`` would emit None and then fall
            # through, processing the already-seen domain a second time.
            return
        self.uniqueDomainUrl[currentDomain] = currentDomain

        item = ImportUrlList()
        response.meta['item'] = item

        # Reoccurring URLs
        item = self.findReoccurringUrls(response)
        urls = item['list']
        self.logger.info('Output: %s', urls)

        # Crawl reoccurring urls
        #for href in urls:
        #    yield scrapy.Request(response.urljoin(href), callback=self.parse)

    def findReoccurringUrls(self, response):
        """Store the page's links on the item and build follow-up requests.

        Returns the item taken from ``response.meta['item']`` with its
        ``'list'`` field set to the same-host URLs found on the page.
        """
        self.logger.info('Finding reoccurring URLs in: %s', response.url)
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = urls
        response.meta['item'] = item

        # Get all URLs on each web page (limit 5 pages)
        for value in urls[:self.maximumReoccurringPages]:
            self.logger.info('Parse: %s', value)
            # NOTE: the request is only constructed here. It is never yielded
            # or returned from a callback, so nothing hands it to Scrapy for
            # downloading and ``test`` is not invoked.
            request = Request(value, callback=self.test, meta={'item':item})
            item = request.meta['item']
        return item

    def test(self, response):
        """Narrow the item's URL list to the URLs also linked from this page."""
        self.logger.info('Page title: %s', response.css('title').extract())
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = set(item['list']) & set(urls)
        return item

    def findUrlsOnCurrentPage(self, response):
        """Return the unique same-host absolute URLs linked from the page.

        Pure fragment links (``#...``) and links pointing at another host are
        skipped; the first-seen order of the remaining URLs is preserved.
        """
        newUrls = []
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        currentUrl = currentUrlParse.scheme + '://' + currentUrlParse.hostname
        for href in response.css('a::attr(href)').extract():
            if href.startswith('#'):
                continue
            newUrl = urlparse.urljoin(currentUrl, href)
            if urlparse.urlparse(newUrl).hostname != currentDomain:
                continue
            if newUrl not in newUrls:
                newUrls.append(newUrl)
        return newUrls
It seems to be executing only the first page; the other Request() objects are never executed, as I can see from the callback not being called.
What does ImportUrlList() do? Did you implement it yourself?
You also forgot to call scrapy.Request in findReoccurringUrls:
request = scrapy.Request(value, callback=self.test, meta={'item':item})
def findReoccurringUrls(self, response):
    """Store the page's links on the item and build follow-up requests.

    Returns the item taken from ``response.meta['item']`` with its ``'list'``
    field set to the URLs returned by ``findUrlsOnCurrentPage``; the caller
    reads ``item['list']`` from the return value.
    """
    self.logger.info('Finding reoccurring URLs in: %s', response.url)
    item = response.meta['item']
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    response.meta['item'] = item

    # Get all URLs on each web page (limit 5 pages)
    for value in urls[:self.maximumReoccurringPages]:
        self.logger.info('Parse: %s', value)
        # NOTE(review): the request is built but not yielded from a callback
        # here; Scrapy only schedules requests that a callback returns or
        # yields -- confirm how the caller is expected to emit it.
        request = scrapy.Request(value, callback=self.test, meta={'item':item})
        item = request.meta['item']
    return item
Related
I'm scraping a portal using Scrapy. Here, I need to get detailed metadata about every product.
So, to get this information, I first scrape all the products on a given page and then loop through each item to get its metadata. But all the details are duplicated in the CSV file.
Here is the sample code:
class MyDinSpider(scrapy.Spider):
    """Spider that reads listing URLs from a CSV file and scrapes products."""

    name = 'abc'
    allowed_domains = ['www.abc.com.my']
    start_urls = ['http://https://www.abc/online-store/']

    def start_requests(self):
        """Yield one listing request per formatted URL of every CSV row."""
        # NOTE(review): the "U" mode flag was removed in Python 3.11; plain
        # "r" already gives universal newlines on Python 3.
        with open("./csvFiles/urls.csv", "rU") as f:
            reader = csv.DictReader(f)
            for row in reader:
                url = row['url']
                link_urls = [url.format(i) for i in range(2)]
                for link_url in link_urls:
                    print(link_url)
                    request = Request(link_url, callback=self.parse_product_pages, meta={'Category': row['Category']})
                    yield request

    def parse_product_pages(self, response):
        """Extract each product on a listing page and request its detail page."""
        # NOTE: one item instance is created before the loop and the very same
        # object is attached to every request, so each iteration overwrites
        # the fields that the earlier requests' callbacks will later read.
        item = MydinscrapingItem()
        content = response.css('.variants-pop-over').extract()
        for product_content in content:
            val = product_content.split('default value=')[1].split('>')[0]
            link = "https://abc/products/detail/?pid={}".format(val.split('"ProductExtId": "')[1].split('"')[0])
            item['Date'] = datetime.today().date()
            item['Title'] = val.split('"ProductName": "')[1].split('"')[0]
            item['Price'] = val.split('"price":"')[1].split('"')[0]
            yield Request(url=link, callback=self.parse_page2, meta={'item': item})

    def parse_page2(self, response):
        """Complete the item carried in the request meta and emit it."""
        item = response.meta['item']
        # NOTE(review): ``category`` is not defined in the visible code --
        # confirm where it should come from (start_requests passes a
        # 'Category' value in its request meta).
        item['Category'] = category[:-2]
        yield (item)

    def parse(self, response):
        """Unused default callback; all requests name their own callbacks."""
        pass
Here is the output csv file
I am building a very simple scraper, but I am making a very silly mistake somewhere that I am not able to find.
In the response method, I am getting the same response for every URL passed in the loop over all the products on the product list page.
I am adding my code below; please help.
def parse(self, response):
    """Request the detail page of every listed product, then follow "Next"."""
    # NOTE: a single dict is created here and the same object is passed in
    # the meta of every request, so each loop iteration overwrites the values
    # that the previously yielded requests will see in their callback.
    item = {}
    count = 0
    for single in response.xpath('//div[@class="_3O0U0u"]/div'):
        count += 1
        # print(count)
        item['data_id'] = single.xpath('.//@data-id').extract_first()
        item['price'] = single.xpath('.//div[@class="_1vC4OE"]/text()').extract_first()
        item['url'] = single.xpath('.//div[@class="_1UoZlX"]/a[@class="_31qSD5"]/@href').extract_first()
        if not item['url']:
            # Fall back to the alternative product-card markup.
            item['url'] = single.xpath('.//div[@class="_3liAhj _1R0K0g"]/a[@class="Zhf2z-"]/@href').extract_first()
        #print(item)
        if item['url']:
            yield scrapy.Request('https://www.somewebsite.com' + item['url'], callback = self.get_product_detail, priority = 1, meta={'item': item})
        # break

    next_page = response.xpath('//div[@class="_2zg3yZ"]/nav/a[@class="_3fVaIS"]/span[contains(text(),"Next")]/parent::a/@href').extract_first()
    if next_page:
        # Reuse the href extracted above instead of running the XPath again.
        next_page = 'https://www.somewebsite.com' + next_page
        yield scrapy.Request(next_page, callback=self.parse ,priority=1)
def get_product_detail(self, response):
    """Print the ``data_id`` of the item carried in the response meta."""
    dict_item = response.meta['item']
    sku = dict_item['data_id']
    print('dict SKU ======== ', sku)
Let's imagine I have a webpage like this.
counter.php
// Render the counter page only when a "count" query parameter is present.
if(isset($_GET['count'])){
// Coerce the parameter to an integer before doing arithmetic on it.
$count = intval($_GET['count']);
// Neighbouring counter values. NOTE(review): presumably these are the link
// targets of the "Previous"/"Next" anchors, whose markup appears to be
// missing from this listing -- confirm against the original page.
$previous = $count - 1;
$next = $count + 1;
?>
< Previous
Current: <?php echo $count;?>
Next >
<?
}
?>
This is an "infinite" website because you can just keep clicking next to go to the next page (the counter will just increase) or previous etc.
However, if I wanted to crawl this page and follow the links using scrapy like this, scrapy will never stop crawling.
Example spider:
# URLs of every page the spider has visited.
urls = []


class TestSpider(CrawlSpider):
    """Crawl spider that follows every extracted link and records its URL."""

    name = 'test'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/counter?count=1']
    # Follow all links found on every page; nothing bounds the crawl.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Append the URL of the crawled page to the module-level list."""
        urls.append(response.url)
What kind of mechanism can I use to determine if indeed I am stuck in an infinite website and need to break out of it?
You can always break out if the page has no items on it, or if it has no NEXT PAGE button; either one means the pagination has ended.
class TestSpider(CrawlSpider):
    """Paginating spider that stops once a page signals the end of the list.

    ``parse_item`` shows two alternative end-of-pagination checks. Use one of
    them: with both enabled the second request for the same next page is a
    duplicate, which Scrapy's default duplicate filter discards.
    """

    name = 'test'
    allowed_domains = ['example.com']

    def _page_request(self, page):
        """Build the request for counter page number *page*."""
        return Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)

    def start_requests(self):
        """Start the crawl at page 1."""
        page = 1
        yield self._page_request(page)

    def parse_item(self, response):
        """Request the next page unless this page looks like the last one."""
        # The current page number travels in the request meta.
        next_page = int(response.meta['page']) + 1

        #METHOD 1: check if items availble on this page
        items = response.css("li.items")
        if items:
            #Now go to next page
            yield self._page_request(next_page)
        else:
            logging.info("%s was last page", response.url)

        #METHOD 2: check if this page has NEXT PAGE button, most websites has that
        nextPage = response.css("a.nextpage")
        if nextPage:
            #Now go to next page
            yield self._page_request(next_page)
        else:
            logging.info("%s was last page", response.url)
You don't have to use Rule in Scrapy. You can first parse page by page and then iterate over all the items on each page, or you can collect all the item links on each page.
For example:
# URLs of every item page the spider has visited.
urls = []


class TestSpider(CrawlSpider):
    """Spider that paginates manually and stops at a fixed page number."""

    # NOTE(review): Scrapy's documentation advises against overriding
    # ``parse`` on a CrawlSpider; since no rules are used here a plain Spider
    # base class may be the better fit -- confirm.
    name = 'test'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/counter?count=1']
    # Last page number to crawl; this is what breaks the endless pagination.
    max_page = 10

    def parse(self, response):
        """Request every item link on the page, then the next page if allowed."""
        links = response.xpath('//a[@class="item"]/@href').extract()
        for link in links:
            # Request() needs an absolute URL, so resolve relative hrefs first.
            yield Request(response.urljoin(link), self.parse_item)
            # you can insert the item 's url here, so you dont have to yield to parse_item
            # urls.append(link)

        # Split on the last "=" only, so the page number is always the tail.
        url, pg = response.url.rsplit("=", 1)  # you can break infinite loop here
        if int(pg) < self.max_page:  # We loop by page #10
            yield Request(url + "=" + str(int(pg) + 1), self.parse)

    def parse_item(self, response):
        """Append the URL of the crawled item page to the module-level list."""
        urls.append(response.url)
In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
    """Spider that follows directory links and scrapes the linked profiles."""

    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        """Request every profile URL linked from the directory page."""
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one item per ``#profile`` element found on the page."""
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            # NOTE: these expressions start with ``//`` and therefore search
            # the whole document rather than only the subtree of ``sel``.
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Write the function like this:
def parse_dir_contents(self, response):
    """Build a single item from the profile page and return it."""
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.
This is my code
def parse(self, response):
    """Emit an item for the first five rows, then follow the "Next" link."""
    soup = BeautifulSoup(response.body)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="row"]')
    for site in sites[:5]:
        item = TestItem()
        item['username'] = "test5"
        request = Request("http://www.example.org/profile.php", callback = self.parseUserProfile)
        request.meta['item'] = item
        # NOTE: this is the line in question. The item is emitted right away,
        # while ``request`` is never yielded, so parseUserProfile is not
        # called and 'image_urls' is never filled in.
        yield item

    mylinks = soup.find_all("a", text="Next")
    if mylinks:
        nextlink = mylinks[0].get('href')
        yield Request(urljoin(response.url, nextlink), callback=self.parse)
def parseUserProfile(self, response):
    """Set 'image_urls' on the item carried in the meta and return it."""
    item = response.meta['item']
    item['image_urls'] = "test3"
    return item
Now my code above works, but with it I am not getting the value of item['image_urls'] = "test3".
It comes back as null.
Now if I use return request instead of yield item,
then I get an error saying that return cannot be used with a generator.
If I remove this line
yield Request(urljoin(response.url, nextlink), callback=self.parse)
then my code works fine and I can get image_urls, but then I cannot follow the links.
So is there any way I can use return request and yield together, so that I get the image_urls?
I don't really understand your issue, but I see one problem in your code:
def parseUserProfile(self, response):
    """Set 'image_urls' on the item carried in the meta and return it."""
    item = response.meta['item']
    item['image_urls'] = "test3"
    return item
Parse callbacks return values should be sequences, so you should do return [item] or convert your callback into a generator:
def parseUserProfile(self, response):
    """Set 'image_urls' on the item carried in the meta and yield it."""
    item = response.meta['item']
    item['image_urls'] = "test3"
    yield item
Looks like you have a mechanical error. Instead of:
for site in sites[:5]:
item = TestItem()
item['username'] = "test5"
request = Request("http://www.example.org/profile.php", callback = self.parseUserProfile)
request.meta['item'] = item
**yield item**
You need:
for site in sites[:5]:
item = TestItem()
item['username'] = "test5"
request = Request("http://www.example.org/profile.php", callback = self.parseUserProfile)
request.meta['item'] = item
yield request