I am attempting to extract info about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help would be highly appreciated!
Thanks!!
Here is my spider so far:
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
You probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
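You can confirm selectors like these in scrapy shell before touching the spider; a quick sanity check against the listing URL from the code above (results may vary as the site's markup changes):

scrapy shell 'http://artofmanliness.com/articles/page/1/'
>>> # should print the first article link if the selector matches
>>> response.xpath('//article[contains(@class, "aom-article-simple")]//a/@href').get()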
Minimal working code with CrawlerProcess():
Anyone can paste all of the code into a single file script.py and run it as python script.py without creating a project. I use max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
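If your Scrapy is older than 2.1, the FEEDS setting will have no effect; the older equivalent (deprecated since 2.1 but functionally the same here) is:

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    'FEED_FORMAT': 'csv',      # pre-2.1 equivalent of FEEDS
    'FEED_URI': 'output.csv',
})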
Related
I am new to Scrapy. I want to scrape data from alibaba.com but I'm getting None. I don't know where the problem is. Here is my code:
import scrapy


class IndiaSpider(scrapy.Spider):
    name = 'india'
    allowed_domains = ['indiamart.com']
    # search_value = 'car'
    start_urls = [f'https://dir.indiamart.com/search.mp?ss=laptop&prdsrc=1&res=RC4']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'

    def request_header(self):
        yield scrapy.Request(url=self.start_urls, callback=self.parse, headers={'User-Agent': self.user_agent})

    def parse(self, response):
        title = response.xpath("//span[@class='elps elps2 p10b0 fs14 tac mListNme']/a/text()").get()
        related_link = response.xpath("//span[@class='elps elps2 p10b0 fs14 tac mListNme']/a/@href").get()
        yield {
            'titling': title,
            'rel_link': related_link
        }
And I am getting
2023-02-14 15:20:34 [scrapy.core.scraper] DEBUG: Scraped from <200 https://dir.indiamart.com/search.mp?ss=car&prdsrc=1&res=RC4>
{'titling': None, 'rel_link': None, 'images': []}
2023-02-14 15:20:34 [scrapy.core.engine] INFO: Closing spider (finished)
I was getting results yesterday and it was working fine, but today it returns None. It is not a JavaScript-based website. I tried more than once but get the same result.
As @SuperUser told you, the spider gets None because the site uses JavaScript to render the product information. If you disable JavaScript in your browser and reload the page, you will see that the products are not displayed.
However, you can get the information from one of the <script> tags.
import scrapy
import json


class AlibabaSpider(scrapy.Spider):
    name = "alibaba"
    allowed_domains = ["alibaba.com"]
    search_value = "laptop"
    start_urls = [f"https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&tab=all&SearchText={search_value}"]

    def parse(self, response):
        raw_data = response.xpath("//script[contains(., 'window.__page__data__config')]/text()").extract_first()
        raw_data = raw_data.replace("window.__page__data__config = ", "").replace("window.__page__data = window.__page__data__config.props", "")
        data = json.loads(raw_data)
        title = data["props"]["offerResultData"]["offerList"][0]["information"]["puretitle"]
        yield {"title": title}  # Laptops Laptop Cheapest OEM Core I5...
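The same JSON holds every search result, so if you want more than the first title you can loop over the list inside parse, right after json.loads (a sketch based on the keys indexed with [0] above; the exact structure may change on Alibaba's side):

        for offer in data["props"]["offerResultData"]["offerList"]:
            # Each entry mirrors the structure the [0] lookup above relies on.
            yield {"title": offer["information"]["puretitle"]}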
I'm building a Scrapy crawler/spider for a particular website. I send Scrapy a starting URL (let's call it start_urls) and it should get the response of all the URLs (matching determined parameters) contained in it.
Summarizing: it should enter start_url, then search for the company URLs given an allow parameter in the rule, and return the response of each company URL (everything done with headers). My code is only parsing the start_urls, not the URLs I want to extract for the parser. What could be wrong?
LinkExtractor
link_extractor = LinkExtractor(
    allow=['/organization/'],
    allow_domains=['www.scrapsite.com'],
    deny_extensions=IGNORED_EXTENSIONS,  # Filter *.zip, *.csv, etc. (add other extensions as required)
    process_value=lambda url: process_url_value(url, NAME, cleaning_regex=[company_regex]),
)
ScrapySpider
class scrapsiteSpider(CrawlSpider):
    name = NAME
    download_delay = 5.0
    main_page = MAIN_PAGE
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    start_urls = [
        f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
        for rank in range(75, 132, 10)
    ]
    rules = [Rule(link_extractor, callback='parse', follow=True)]
    headers = HEADERS

    @classmethod
    def start_requests(cls):
        logger.info('Starting scrapsite scraping')
        for url in cls.start_urls:
            cls.log_counter += 1
            if cls.log_counter % cls.log_divider == 0:
                logger.info(f'Start request: {url}')
            yield Request(url, dont_filter=True, headers=HEADERS)

    @classmethod
    def parse(cls, response: Response):
        # CAPTURE COMPANIES
        logger.info(f"#### parse PREPROCESSING company {response.url}")
        logger.info(f"{response.meta}")
        if company_regex.search(response.url):
            logger.info(f"Company Detected: {response.url.split('/')[-1]}")
            return cls.parse_item(response, AddedItem())

    @classmethod
    def parse_item(cls, response: Response, item: Item) -> Item:
        logger.info(f"#### parse_item PREPROCESSING company {response.url}")
        item.set_url(value=response.url)
        item.set_source(value=cls.name)
        item.set_response_data(value=response.text)
        item.set_uuid(value=make_id_from_url(url=response.url))
        yield item
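One likely culprit, per the Scrapy docs: CrawlSpider implements its rule handling inside its own parse method, so pointing a Rule at callback='parse' (and overriding parse) short-circuits the rules, which would explain why only the start_urls are processed. A minimal self-contained sketch of the rename, with the question's site and allow pattern kept as placeholders:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class CompanySpider(CrawlSpider):
    # Hypothetical minimal spider; 'scrapsite.com' and '/organization/'
    # mirror the question's setup.
    name = 'scrapsite'
    start_urls = ['https://www.scrapsite.com/search/companies/field/companies/company_page/75']
    rules = [
        # CrawlSpider.parse is reserved for the rules machinery,
        # so the callback gets a different name.
        Rule(LinkExtractor(allow=['/organization/']), callback='parse_company', follow=True),
    ]

    def parse_company(self, response):
        yield {'url': response.url}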
I have a problem with going to the next page: the spider goes to the next page, but then returns to the first page and only gives the data of page 1. I have tried different approaches but have not been able to solve this problem. If there is any solution, please provide it. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy import Request
from selenium import webdriver


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
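For the "get the data and write it to scrapy items" step, one option is to wrap the page source Selenium sees after each click in a Scrapy selector, so the same XPaths keep working on JavaScript-driven pages. A sketch, assuming the listing markup is identical on every page:

from scrapy.selector import Selector


def extract_page_links(page_source):
    # Wrap the raw HTML from Selenium in a Scrapy selector so the
    # spider's existing XPaths can be reused; call this with
    # self.driver.page_source after each next.click().
    sel = Selector(text=page_source)
    return sel.xpath("//div[@class='list-group']//@href").extract()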
I want to extract the email and phone, but I could not find the XPath for them; I was only able to retrieve the XPath of the website. This is the link of the page where I extracted the data: https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link
        }
Now it's working:
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link,
            'phone': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])').get(),
            'email': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get()
        }
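If the positional text()[2] indexing ever breaks, a fallback worth trying in parse_book is to target mailto: and tel: anchors directly; this is an assumption about the page markup, not something verified against this site:

        # Hypothetical fallback selectors; only useful if the page links
        # the email/phone as mailto:/tel: anchors.
        email = response.xpath('//a[starts-with(@href, "mailto:")]/text()').get()
        phone = response.xpath('//a[starts-with(@href, "tel:")]/text()').get()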
I am using Python 3.5.2 and Scrapy 1.1.
There is a nested request in the demo below: specifically, on the article content page there is an AJAX request which gets the author when the user is logged in.
I wrote it as follows, but I can't get the author, and I have a hard time understanding where the problem is.
demo:
# -*- coding: utf-8 -*-
import scrapy
from demo.items import ExampleItem
from scrapy.spiders import CrawlSpider
import re


class ExampleSpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/articles-list.php?page=1",
        "http://www.example.com/articles-list.php?page=2",
        "http://www.example.com/articles-list.php?page=3",
        "http://www.example.com/articles-list.php?page=4",
        "http://www.example.com/articles-list.php?page=5",
        "http://www.example.com/articles-list.php?page=6",
    ]
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'PHPSESSID=12345370000029b72333333dc999999; QS[uid]=100; QS[username]=example; QS[password]=example.com; QS[pmscount]=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2774.3 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }

    def parse(self, response):
        hrefs = response.xpath('a/@href')
        for href in hrefs:
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_article_contents)

    def parse_article_contents(self, response):
        for sel in response.xpath('/html/body'):
            item = ExampleItem()
            item['articleUrl'] = response.url
            item['title'] = sel.xpath('div[3]/a[2]/@href')[0].extract()
            item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()')[0].extract()
            # On the article content page there is an AJAX request which gets the author when logged in.
            articleId = re.search(u'id=(\d{1,4})&', item['articleUrl']).group(1)
            articleAuthorUrl = 'http://www.example.com/plus/ajax_author.php?id=' + articleId
            # Crawling the author below. Is it correct?
            def request_article_author(self):
                return scrapy.Request(url=articleAuthorUrl, headers=headers, callback=self.parse_article_author)

    def parse_article_author(self, response):
        item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
        # Can item['author'] be yielded via "yield item" below?
        yield item
Any ideas?
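One idea, offered as a sketch rather than a tested fix: the nested request_article_author function is defined but never called, so the author request is never sent, and parse_article_author has no access to item. The usual Scrapy pattern is to pass the half-filled item through Request.meta and yield it from the inner callback:

    def parse_article_contents(self, response):
        for sel in response.xpath('/html/body'):
            item = ExampleItem()
            item['articleUrl'] = response.url
            item['title'] = sel.xpath('div[3]/a[2]/@href')[0].extract()
            item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()')[0].extract()
            articleId = re.search(u'id=(\d{1,4})&', item['articleUrl']).group(1)
            articleAuthorUrl = 'http://www.example.com/plus/ajax_author.php?id=' + articleId
            # Hand the partially built item to the next callback via meta.
            yield scrapy.Request(url=articleAuthorUrl, headers=self.headers,
                                 meta={'item': item}, callback=self.parse_article_author)

    def parse_article_author(self, response):
        item = response.meta['item']
        item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
        # The complete item, author included, is yielded exactly once.
        yield item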