How to skip one page from iterations? - Python

How can I skip one iteration of the spider if the webpage contains certain data?
Page titles:
Each page has a title; I've omitted the other data (dates, likes) in this example.
page 1 title: 'We like cats' # this title is valid
page 2 title: 'This title contains WORD X...' # this title is not valid (skip it)
page 3 title: 'Best ideas' # this title is valid
Code:
from scrapy.spiders import CrawlSpider

class Carflix(CrawlSpider):
    name = 'carflix'
    allowed_domains = ['sitex.com']
    start_urls = ['http://sitex.com/page-1.html',
                  'http://sitex.com/page-2.html',
                  'http://sitex.com/page-3.html']

    def parse(self, response):
        date = response.xpath('//div[@class="date"]/text()').extract_first()
        pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
        if 'WORD X' in pagetitle:
            pass  # what do I need to do here so the data is skipped when the title contains 'WORD X'?
        likes = response.xpath('//div[@class="likes"]/text()').extract_first()
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
The result should be:
[{
    'pagetitle': 'We like cats',
    'date': '01/01/2019',
    'likes': 200
},
{
    'pagetitle': 'Best ideas',
    'date': '02/01/2019',
    'likes': 100
}]

Just yield your results under your specified condition:
def parse(self, response):
    date = response.xpath('//div[@class="date"]/text()').extract_first()
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    likes = response.xpath('//div[@class="likes"]/text()').extract_first()
    if 'WORD X' not in pagetitle:
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
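Equivalently (a minimal variation, not part of the original answer), you can invert the check and return early, so the skip is explicit and the happy path stays unindented; checking pagetitle first also avoids a TypeError when the title div is missing:

def parse(self, response):
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    # Guard clause: yield nothing for pages whose title contains 'WORD X'.
    if pagetitle and 'WORD X' in pagetitle:
        return
    yield {
        'pagetitle': pagetitle,
        'date': response.xpath('//div[@class="date"]/text()').extract_first(),
        'likes': response.xpath('//div[@class="likes"]/text()').extract_first(),
    }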

Related

How to fix this error during scraping using BeautifulSoup?

I am trying to do web scraping using the BeautifulSoup and requests Python libraries. I want to filter the news titles from the Hacker News website, but it's showing an error when I run it.
import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline a')
subtext = soup.select('.subtext')

def create_custom_hn(links, subtext):
    hn = []
    for index, item in enumerate(links):
        title = links[index].getText()
        href = links[index].get('href', None)
        votes = subtext[index].select('.score')
        if len(votes):
            points = int(votes[0].getText().replace(' points', ''))
            print(points)
            hn.append({'title': title, 'href': href})
    return hn

print(create_custom_hn(links, subtext))
The error says
votes = subtext[index].select('.score')
~~~~~~~^^^^^^^
IndexError: list index out of range
Here is a fixed version of the code from the question:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://news.ycombinator.com/news")
soup = BeautifulSoup(res.text, "html.parser")
links = soup.select(".titleline > a")

def create_custom_hn(links):
    hn = []
    for link in links:
        title = link.getText()
        href = link.get("href", None)
        votes = link.find_next(class_="score")
        points = int(votes.getText().replace(" points", ""))
        hn.append({"title": title, "href": href, "points": points})
    return hn

print(create_custom_hn(links))
Prints:
[
    {
        "title": "Urllib3 in 2022",
        "href": "https://sethmlarson.dev/urllib3-in-2022",
        "points": 97,
    },
    {
        "title": "First public release of Pushup: a new compiler for making web apps in Go",
        "href": "https://github.com/adhocteam/pushup",
        "points": 18,
    },
    {
        "title": "Intelligence – A good collection of great OSINT Resources",
        "href": "https://github.com/ARPSyndicate/awesome-intelligence",
        "points": 113,
    },
    {
        "title": "Microsoft is preparing to add ChatGPT to Bing",
        "href": "https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter",
        "points": 760,
    },
    ...and so on.
Try to select your elements more specifically; your selection soup.select('.titleline a') matches more elements (60) than you actually want (30):
[<a href="https://sethmlarson.dev/urllib3-in-2022">Urllib3 in 2022</a>,
 <span class="sitestr">sethmlarson.dev</span>, ...]
I would also recommend iterating the elements in another way, so that you can handle missing values.
Example
import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')

data = []
for e in soup.select('tr.athing'):
    data.append({
        'title': e.select_one('.titleline a').get_text(),
        'url': e.select_one('.titleline a').get('href'),
        'votes': e.find_next(class_='subtext').text.split()[0]
    })
print(data)
print(data)
Output
[{'title': 'Urllib3 in 2022', 'url': 'https://sethmlarson.dev/urllib3-in-2022', 'votes': '93'}, {'title': 'First public release of Pushup: a new compiler for making web apps in Go', 'url': 'https://github.com/adhocteam/pushup', 'votes': '16'}, {'title': 'Intelligence – A good collection of great OSINT Resources', 'url': 'https://github.com/ARPSyndicate/awesome-intelligence', 'votes': '109'}, {'title': 'Microsoft is preparing to add ChatGPT to Bing', 'url': 'https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter', 'votes': '755'}, {'title': 'Juan Tamariz, the godfather of close-up card magic', 'url': 'https://www.nytimes.com/2023/01/02/magazine/juan-tamariz-magic.html', 'votes': '31'}, {'title': 'The Expanding Dark Forest and Generative AI', 'url': 'https://maggieappleton.com/ai-dark-forest', 'votes': '223'}, {'title': 'Irreconcilable differences between local and distributed computing (1994)', 'url': 'https://scholar.harvard.edu/waldo/publications/note-distributed-computing', 'votes': '29'},...]
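One payoff of the per-row approach is that rows without a score (job ads have a subtext row but no vote count) can be mapped to None instead of raising. Here is a hedged variation of the loop above that guards each lookup:

import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')

data = []
for e in soup.select('tr.athing'):
    subtext = e.find_next('td', class_='subtext')               # metadata row for this story
    score = subtext.select_one('.score') if subtext else None   # job ads have no .score
    data.append({
        'title': e.select_one('.titleline a').get_text(),
        'url': e.select_one('.titleline a').get('href'),
        'votes': score.get_text().replace(' points', '') if score else None,
    })
print(data)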

Iterate and extract info over div class

import requests
from bs4 import BeautifulSoup

url = "https://boulder.noshdelivery.co/restaurants"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
restaurant_wrapper = soup.find(class_="dd_rest_list")
restaurants = restaurant_wrapper.find_all(class_="menu__vendor-name")
restaurant_street_address = restaurant_wrapper.find("span", itemprop="streetAddress")
restaurant_address_locality = restaurant_wrapper.find("span", itemprop="addressLocality")

def extract_restaurant_data(restaurant):
    restaurant_title = restaurant_wrapper.find(class_="menu__vendor-name")
    return {
        "title": restaurant_title.text.strip(),
        "streetAddress": restaurant_street_address.text.strip(),
        "addressLocality": restaurant_address_locality.text.strip()
    }

results = [extract_restaurant_data(restaurant) for restaurant in restaurants]
print(results)
I would like to know why this code prints exactly the same info and does not iterate over the list of restaurants.
My output is this:
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.', 'addressLocality': 'Boulder'},
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.', 'addressLocality': 'Boulder'}........
The info is the same. I do not know why my code does not iterate over the different names from the list of "restaurants".
You only did one find for each piece of data. Do a find_all on each section and then zip them together!
restaurant_details = zip(
    restaurant_wrapper.find_all(class_="menu__vendor-name"),
    restaurant_wrapper.find_all("span", itemprop="streetAddress"),
    restaurant_wrapper.find_all("span", itemprop="addressLocality"),
)

results = [
    {
        "title": title.text.strip(),
        "streetAddress": street_address.text.strip(),
        "addressLocality": address_locality.text.strip()
    }
    for title, street_address, address_locality in restaurant_details
]
print(results)
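One caveat of the zip approach: zip stops at the shortest of its inputs, so if any restaurant is missing a street address or locality, the three lists fall out of alignment and later entries get paired with the wrong restaurant (or silently dropped).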
Your function has restaurant_wrapper.find(class_="menu__vendor-name") written in it, so each time it runs it returns only the first occurrence of the class menu__vendor-name.
To print a new restaurant's details on each iteration, you have to access each web element individually.
The code below would allow you to get the details for all restaurants.
restwords = restaurant_wrapper.find_all("div", {"class": "dd_restwords"})

def extract_restaurant_data(restaurant):
    title = restaurant.find("div", {"class": "menu__vendor-name"}).text
    streetAddress = restaurant.find("span", {"itemprop": "streetAddress"}).text
    addressLocality = restaurant.find("span", {"itemprop": "addressLocality"}).text
    rest_data = {
        "title": title,
        "streetAddress": streetAddress,
        "addressLocality": addressLocality
    }
    return rest_data

for restaurant in restwords:
    print(extract_restaurant_data(restaurant))

Scraping data from a site, and it returns nothing?

I'm trying to scrape some data from a site called laced.co.uk, and I'm a tad confused about what's going wrong. I'm new to this, so please explain it simply if possible! Here is my code:
from bs4 import BeautifulSoup
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text=" £195 ")
print(prices)
Thank you! (The price at the time of posting was £195; it showed as the size 7 "buy now" price on the page.)
The price is loaded within a <script> tag on the page:
<script>
  typeof(dataLayer) != "undefined" && dataLayer.push({
    'event': 'eec.productDetailImpression',
    'page': {
      'ecomm_prodid': 'DD1391-100'
    },
    'ecommerce': {
      'detail': {
        'actionField': {'list': 'Product Page'},
        'products': [{
          'name': 'Nike Dunk Low Retro Black White',
          'id': 'DD1391-100',
          'price': '195.0',
          'brand': 'Nike',
          'category': 'Dunk, Dunk Low, Mens Nike Dunks',
          'variant': 'White',
          'list': 'Product Page',
          'dimension1': '195.0',
          'dimension2': '7',
          'dimension3': '190',
          'dimension4': '332'
        }]
      }
    }
  });
</script>
You can use a regular expression pattern to search for the price. Note that there's no need for BeautifulSoup here:
import re
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
price = re.search(r"'price': '(.*?)',", result.text).group(1)
print(f"£ {price}")
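If you need more of the product fields, one option (a sketch, assuming the page still embeds the dataLayer block shown above) is to capture the whole products list; since that block happens to use Python-compatible literal syntax, ast.literal_eval can parse it without a JSON library:

import ast
import re
import requests

url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)

# Capture the bracketed products list from the dataLayer push; its
# single-quoted keys/values are valid Python literal syntax.
match = re.search(r"'products': (\[.*?\])", result.text, re.DOTALL)
products = ast.literal_eval(match.group(1))
print(products[0]['name'], products[0]['price'])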

How can I only parse the first HTML block from multiple blocks, if they all contain the same class-name?

I need to parse info from a site. On this site there are two blocks, "Today" and "Yesterday", and they have the same class name of standard-box standard-list.
How can I parse only the first block (under "Today"), without extracting the information from "Yesterday", if both contain the same class name?
Here is my code:
import requests
from bs4 import BeautifulSoup

url_news = "https://www.123.org/"
response = requests.get(url_news)
soup = BeautifulSoup(response.content, "html.parser")
items = soup.findAll("div", class_="standard-box standard-list")

news_info = []
for item in items:
    news_info.append({
        "title": item.find("div", class_="newstext").text,
        "link": item.find("a", class_="newsline article").get("href")
    })
When running your provided code, I don't get an output for items. However, you said that you do, so:
If you only want to get the data under "Today", you can use .find() instead of .find_all(), since .find() will only return the first found tag -- which is "Today" and not the other tags.
So, instead of:
items = soup.findAll("div", class_="standard-box standard-list")
Use:
items = soup.find("div", class_="standard-box standard-list")
Additionally, to find the link, I needed to access the attribute using tag-name[attribute]. Here is working code:
news_info = []
items = soup.find("div", class_="standard-box standard-list")
for item in items:
    news_info.append(
        {"title": item.find("div", class_="newstext").text, "link": item["href"]}
    )
print(news_info)
Output:
[{'title': 'NIP crack top 3 ranking for the first time in 5 years', 'link': 'https://www.hltv.org/news/32545/nip-crack-top-3-ranking-for-the-first-time-in-5-years'}, {'title': 'Fessor joins Astralis Talent', 'link': 'https://www.hltv.org/news/32544/fessor-joins-astralis-talent'}, {'title': 'Grashog joins AGO', 'link': 'https://www.hltv.org/news/32542/grashog-joins-ago'}, {'title': 'ISSAA parts ways with Eternal Fire', 'link': 'https://www.hltv.org/news/32543/issaa-parts-ways-with-eternal-fire'}, {'title': 'BLAST Premier Fall Showdown Fantasy live', 'link': 'https://www.hltv.org/news/32541/blast-premier-fall-showdown-fantasy-live'}, {'title': 'FURIA win IEM Fall NA, EG claim final Major Legends spot', 'link': 'https://www.hltv.org/news/32540/furia-win-iem-fall-na-eg-claim-final-major-legends-spot'}]

XPath: Nth TD within a TR

Crawling the following page: http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/, I'm trying to grab each value from the table (salary, job title, years with district, etc.). When I access these from the scrapy shell, they all display when I use response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract(). However, when I do this within the crawler, only the first element (district) displays. Any suggestions?
Crawler code (ideally, each element would go into its own variable for cleaner output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider2(CrawlSpider):
    # name of the spider
    name = 'stlteacher'
    # list of allowed domains
    allowed_domains = ['graphics.stltoday.com']
    # starting url for scraping
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/teachers/']
    rules = [
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/position/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
    ]
    # setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stlteachers3.csv'
    }

    def parse_item(self, response):
        # remove XML namespaces
        response.selector.remove_namespaces()
        # extract article information
        url = response.url
        name = response.xpath('//p[@class="table__title"]/text()').extract()
        district = response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract()
        for item in zip(name, district):
            scraped_info = {
                'url': url,
                'name': item[0],
                'district': item[1],
            }
            yield scraped_info
Your zip is a bit confusing there. If you want to crawl the whole table, then you need to iterate through the table rows and pick up each row's name and value.
I got pretty good results with this piece of code:
def parse_item(self, response):
    name = response.xpath('//p[@class="table__title"]/text()').extract_first()
    item = {
        'name': name,
        'url': response.url
    }
    for row in response.xpath('//th[@scope="row"]'):
        row_name = row.xpath('text()').extract_first('').lower().strip(':')
        row_value = row.xpath('following-sibling::td[1]/text()').extract_first()
        item[row_name] = row_value
    yield item
This returns:
{
    'name': 'Bracht, Nathan',
    'url': 'http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/',
    'district': 'Affton 101',
    'school': 'Central Office',
    'position': 'Central Office Admin.',
    'degree earned': 'Doct',
    'salary': '$152,000.00',
    'extended contract pay': None,
    'extra duty pay': None,
    'total pay (all combined)': '$152,000.00',
    'years in district': '5',
    'years in mo schools': '19',
    'multiple position detail': None
}
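Since the spider's custom_settings export these items to CSV, row names like 'total pay (all combined)' become column headers verbatim. If you prefer identifier-style columns, here is a small hedged tweak to the loop above (the re import and key naming are my own, not from the original answer):

import re

for row in response.xpath('//th[@scope="row"]'):
    row_name = row.xpath('text()').extract_first('').lower().strip(':')
    row_value = row.xpath('following-sibling::td[1]/text()').extract_first()
    # e.g. 'total pay (all combined)' -> 'total_pay_all_combined'
    item[re.sub(r'\W+', '_', row_name).strip('_')] = row_value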
