Remove unnecessary url from scrapy - python

import scrapy
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """Print every link found inside the ``list-group`` div of the start page."""

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    # Per-spider overrides: one request at a time, one second apart,
    # with a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Resolve each extracted href against the response URL and print it."""
        # XPath selects attributes with ``@`` (``@class``, ``@href``).
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            print(url)
I want to remove the following unnecessary URLs from the extracted links. The website is https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
http://www.unbr.ro
http://www.inppa.ro
http://www.uniuneanotarilor.ro/
http://www.caav.ro
http://www.executori.ro/
http://www.csm1909.ro
http://www.inm-lex.ro
http://www.just.ro

You can use the endswith method together with the continue keyword to skip the unwanted URLs:
import scrapy
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """Print the links inside the ``list-group`` div, skipping bare ``.ro`` site links."""

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    # Per-spider overrides: one request at a time, one second apart,
    # with a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Print each resolved link unless it ends in ``.ro`` or ``.ro/``."""
        # XPath selects attributes with ``@`` (``@class``, ``@href``).
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            # str.endswith accepts a tuple of suffixes, so one call covers
            # both the plain and the trailing-slash form.
            if url.endswith(('.ro', '.ro/')):
                continue
            print(url)
Output:
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159077&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159076&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159075&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159021&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159020&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159019&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159018&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=21846&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=165927&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=83465&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=47724&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=32097&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=29573&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=19880&Signature=378270

Related

python Scrapy return nothing from the website

I want to crawl the user-review section of https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3, but the spider returns an empty result even though the selector path is correct.
import scrapy
from scrapy import Selector,Request
class LaptopSpider(scrapy.Spider):
    """Request a single product page and print the text matched by a CSS selector."""

    name = 'cs'

    def start_requests(self):
        """Yield the one request for the product page."""
        # The URL literal must not carry leading whitespace.
        url = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
        yield Request(url, callback=self.parse)

    def parse(self, response):
        """Print the first text node matched by the selector (``None`` if nothing matches)."""
        products_selector = response.css('#productRecap > div.p--z > div:nth-child(3) > div > div > div > div > div.Oi-z > div::text').get()
        print(products_selector)
Try this to get the reviews from the link in your post:
import scrapy
class ZapposSpider(scrapy.Spider):
    """Query the reviews endpoint for one product and yield reviewer/summary pairs."""

    name = 'zappos'
    link = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
    base_url = 'https://api.prod.cassiopeia.ugc.zappos.com/display/v2/reviews'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    # Query-string template; ``productId`` is filled in per request.
    params = {
        'offset': '0',
        'page': '1',
        'productId': '',
        'sort': 'upVotes:desc,overallRating:desc,reviewDate:desc'
    }

    def start_requests(self):
        """Build the reviews request for the product id embedded in ``link``."""
        # The product id is the path segment right after "product/".
        product_id = self.link.split("product/")[1].split("/")[0]
        # Copy the template instead of mutating the class-level dict,
        # which is shared by every instance of the spider.
        params = dict(self.params, productId=product_id)
        yield scrapy.FormRequest(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse,
            method="GET",
            formdata=params,
        )

    def parse(self, response):
        """Yield one dict per entry of the ``reviews`` list in the JSON response."""
        for item in response.json()['reviews']:
            yield {"reviewer": item['name'], "review": item['summary']}

Xpath not found using scrapy

I want to extract the email and phone number, but I could not find the XPath for them; so far I can only retrieve the website link. This is the page I am extracting the data from: https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow each member link on the listing page and yield the member's website link."""

    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    # Per-spider overrides: one request at a time, one second apart,
    # with a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Request every detail page linked from the title cells of the listing."""
        # XPath selects attributes with ``@`` (``@class``, ``@href``).
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield the first external-link href found on a detail page."""
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link
        }
Now,it's working.
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow each member link on the listing page and yield website, phone and email."""

    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    # Per-spider overrides: one request at a time, one second apart,
    # with a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Request every detail page linked from the title cells of the listing."""
        # XPath selects attributes with ``@`` (``@class``, ``@href``).
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield the website link plus the phone and email text of a detail page."""
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link,
            # normalize-space() trims and collapses whitespace in the selected text node.
            'phone': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])').get(),
            'email': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get()
        }

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Walk the paginated article listing and yield title, category, date and URL per article."""

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        """Yield one request per listing page."""
        # NOTE(review): range() starts at 0, so the first URL requested is
        # .../articles/page/0/ -- confirm the site serves that page.
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        """Request the first link of every article teaser on a listing page."""
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield the extracted text lists for title, category and date, plus the page URL."""
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
# Scrapy settings for the 'aom' project.
BOT_NAME = 'aom'
# Package(s) in which spiders are looked up, and where new spiders are created.
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# User-Agent header sent with requests; this value is a desktop Chrome string,
# not one that identifies the bot.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# robots.txt rules are NOT obeyed with this value.
ROBOTSTXT_OBEY = False
# Hand every response to the spider callbacks regardless of its HTTP status code.
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but there is one at
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
You probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but there is one at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but there is one at
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
Everyone can paste all code in one file script.py and run it as python script.py without creating project.
I use max_pages = 2 to test only few articles.
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Walk the paginated article listing and yield title, category, date and URL per article."""

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        """Yield one request per listing page."""
        # NOTE(review): range() starts at 0, so the first URL requested is
        # .../articles/page/0/ -- confirm the site serves that page.
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        """Request the first link of every article teaser on a listing page."""
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield the extracted text lists for title, category and date, plus the page URL."""
        # The itemprop attributes are used for title and date; the category
        # comes from the links inside the "in-category" paragraph.
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
from scrapy.crawler import CrawlerProcess

# Run the crawl only when the file is executed as a script
# (``python script.py``), not when it is imported.
if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(ArticlesSpider)
    c.start()

Python web scraping with 'scrapy': not extracting text from span

The problem is in line 54:
port = data.xpath('./td[3]/span[@class="port"]/text()').extract_first()
Full code
import json
import re
import urllib
from html.parser import HTMLParser
from urllib.parse import urljoin
from scrapy import Field, Item, Selector
from scrapy.http import FormRequest, HtmlResponse, Request
from scrapy.spiders import CrawlSpider
from requests import Session
class ProxyServersPro(Item):
    """Item describing one proxy row; every attribute is a plain Scrapy ``Field``."""

    ip = Field()
    port = Field()
    country = Field()
    speed = Field()
    protocol = Field()
    anon = Field()
    lastcheck = Field()
class ProxyServers(CrawlSpider):
    """Scrape the first five pages of the proxy list and yield one item per table row."""

    name = "ProxyServersProCrawler"
    allowed_domains = ["proxyservers.pro"]
    # Browser-like headers sent with every listing request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # Read by start_requests() below; this is not Scrapy's own ``start_urls``.
    start_url = [
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/2",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/3",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/4",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/5",
    ]

    def __init__(self, *args, **kwargs):
        """Forward all spider arguments to the base class unchanged."""
        super().__init__(*args, **kwargs)

    def start_requests(self):
        """Yield one request per listing URL, sending the custom headers."""
        for url in self.start_url:
            yield Request(url, callback=self.parse_companies, headers=self.headers)

    def parse_companies(self, response):
        """Yield a ``ProxyServersPro`` item for each row of the proxy table."""
        # XPath selects attributes with ``@`` (``@class``).
        table = response.xpath('//table[@class="table table-hover"]/tbody/tr')
        for data in table:
            item = ProxyServersPro()
            item["ip"] = data.xpath("./td[2]/a/text()").extract_first()
            item["country"] = data.xpath("./td[4]/text()").extract_first()
            item["protocol"] = data.xpath("./td[7]/text()").extract_first()
            item["anon"] = data.xpath("./td[8]/text()").extract_first()
            # extract_first() returns None when the span has no text node.
            item["port"] = data.xpath('./td[3]/span[@class="port"]/text()').extract_first()
            yield item
Scrapy is not extracting a value because there is no text inside the span; read the span's data-port attribute instead.
Try this:
(Pdb) data.xpath('./td[3]/span/@data-port').extract_first()
'050F040E'

Scrapy Nested request

I am using Python 3.5.2 and Scrapy 1.1.
There is a nested request in the demo below, specifically, in the page of article content, there is an ajax request which gets the author when he logs in.
I wrote it as follows, but I can't get the author, and I have a hard time understanding where the problem is.
demo:
# -*- coding: utf-8 -*-
import scrapy
from demo.items import ExampleItem
from scrapy.spiders import CrawlSpider
import re
class ExampleSpider(CrawlSpider):
    """Scrape articles, then fetch each article's author with a follow-up request.

    The partially filled item is passed to the author callback through
    ``Request.meta`` and is only yielded once the author has been added.
    """

    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/articles-list.php?page=1",
        "http://www.example.com/articles-list.php?page=2",
        "http://www.example.com/articles-list.php?page=3",
        "http://www.example.com/articles-list.php?page=4",
        "http://www.example.com/articles-list.php?page=5",
        "http://www.example.com/articles-list.php?page=6",
    ]
    # Headers sent with the author (AJAX) request only.
    headers = {
        'Accept':'*/*',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Connection':'keep-alive',
        'Cookie':'PHPSESSID=12345370000029b72333333dc999999; QS[uid]=100; QS[username]=example; QS[password]=example.com; QS[pmscount]=1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2774.3 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest'
    }

    def parse(self, response):
        """Request every article linked from a listing page."""
        # NOTE(review): 'a/@href' is a relative path evaluated from the
        # document root -- confirm it matches the listing markup.
        hrefs = response.xpath('a/@href')
        for href in hrefs:
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_article_contents)

    def parse_article_contents(self, response):
        """Fill URL, title and content, then request the author for the same item."""
        for sel in response.xpath('/html/body'):
            item = ExampleItem()
            item['articleUrl'] = response.url
            item['title'] = sel.xpath('div[3]/a[2]/@href')[0].extract()
            item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()')[0].extract()
            # The article page loads the author through an AJAX request
            # keyed by the article id found in the URL.
            match = re.search(r'id=(\d{1,4})&', item['articleUrl'])
            if match is None:
                # Without an id the author URL cannot be built; emit the
                # item without an author instead of raising.
                self.logger.warning('No article id in %s', response.url)
                yield item
                continue
            article_author_url = 'http://www.example.com/plus/ajax_author.php?id=' + match.group(1)
            yield scrapy.Request(
                url=article_author_url,
                headers=self.headers,
                callback=self.parse_article_author,
                # Carry the half-built item over to the author callback.
                meta={'item': item},
            )

    def parse_article_author(self, response):
        """Add the author to the item carried in ``response.meta`` and yield it."""
        item = response.meta['item']
        item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
        yield item
Any ideas?

Categories

Resources