I am trying to do web scraping with the BeautifulSoup and requests Python libraries. I want to extract the news titles from the Hacker News website, but I get an error when running my code.
import requests
from bs4 import BeautifulSoup
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline a')
subtext = soup.select('.subtext')
def create_custom_hn(links, subtext):
    hn = []
    for index, item in enumerate(links):
        title = links[index].getText()
        href = links[index].get('href', None)
        votes = subtext[index].select('.score')
        if len(votes):
            points = int(votes[0].getText().replace(' points', ''))
            print(points)
        hn.append({'title': title, 'href': href})
    return hn
print(create_custom_hn(links, subtext))
The error says:
    votes = subtext[index].select('.score')
            ~~~~~~~^^^^^^^
IndexError: list index out of range
Here is a fixed version of the code from the question:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://news.ycombinator.com/news")
soup = BeautifulSoup(res.text, "html.parser")
links = soup.select(".titleline > a")
def create_custom_hn(links):
    hn = []
    for link in links:
        title = link.getText()
        href = link.get("href", None)
        votes = link.find_next(class_="score")
        points = int(votes.getText().replace(" points", ""))
        hn.append({"title": title, "href": href, "points": points})
    return hn
print(create_custom_hn(links))
Prints:
[
    {
        "title": "Urllib3 in 2022",
        "href": "https://sethmlarson.dev/urllib3-in-2022",
        "points": 97,
    },
    {
        "title": "First public release of Pushup: a new compiler for making web apps in Go",
        "href": "https://github.com/adhocteam/pushup",
        "points": 18,
    },
    {
        "title": "Intelligence – A good collection of great OSINT Resources",
        "href": "https://github.com/ARPSyndicate/awesome-intelligence",
        "points": 113,
    },
    {
        "title": "Microsoft is preparing to add ChatGPT to Bing",
        "href": "https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter",
        "points": 760,
    },
...and so on.
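One caveat: HN occasionally lists rows without a score (job postings). In that case link.find_next(class_="score") does not return None but silently picks up the score of the next story. A more defensive sketch of the same function, scoping the lookup to the row's own subtext cell first (my variant, not part of the original answer):

def create_custom_hn(links):
    hn = []
    for link in links:
        title = link.getText()
        href = link.get("href", None)
        # Scope to this row's own subtext cell, then look for a score inside it;
        # job postings have no score element, so fall back to 0
        subtext = link.find_next(class_="subtext")
        score = subtext.select_one(".score") if subtext else None
        points = int(score.getText().replace(" points", "")) if score else 0
        hn.append({"title": title, "href": href, "points": points})
    return hn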
Try to select your elements more specifically; your selection soup.select('.titleline a') includes more elements (60) than you may want to select (30):
[<a href="https://sethmlarson.dev/urllib3-in-2022">Urllib3 in 2022</a>,
 <a href="from?site=sethmlarson.dev"><span class="sitestr">sethmlarson.dev</span></a>,...]
I would also recommend iterating the elements in another way, so you are able to handle missing values.
Example
import requests
from bs4 import BeautifulSoup
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
data = []

for e in soup.select('tr.athing'):
    data.append({
        'title': e.select_one('.titleline a').get_text(),
        'url': e.select_one('.titleline a').get('href'),
        'votes': e.find_next(class_='subtext').text.split()[0]
    })
print(data)
Output
[{'title': 'Urllib3 in 2022', 'url': 'https://sethmlarson.dev/urllib3-in-2022', 'votes': '93'}, {'title': 'First public release of Pushup: a new compiler for making web apps in Go', 'url': 'https://github.com/adhocteam/pushup', 'votes': '16'}, {'title': 'Intelligence – A good collection of great OSINT Resources', 'url': 'https://github.com/ARPSyndicate/awesome-intelligence', 'votes': '109'}, {'title': 'Microsoft is preparing to add ChatGPT to Bing', 'url': 'https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter', 'votes': '755'}, {'title': 'Juan Tamariz, the godfather of close-up card magic', 'url': 'https://www.nytimes.com/2023/01/02/magazine/juan-tamariz-magic.html', 'votes': '31'}, {'title': 'The Expanding Dark Forest and Generative AI', 'url': 'https://maggieappleton.com/ai-dark-forest', 'votes': '223'}, {'title': 'Irreconcilable differences between local and distributed computing (1994)', 'url': 'https://scholar.harvard.edu/waldo/publications/note-distributed-computing', 'votes': '29'},...]
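The same caveat about score-less rows (job postings) applies here: subtext.text.split()[0] would then yield the age rather than a vote count. A hedged variant of the loop that records None when a row has no score (my sketch, not from the original answer):

for e in soup.select('tr.athing'):
    score = e.find_next(class_='subtext').select_one('.score')
    data.append({
        'title': e.select_one('.titleline a').get_text(),
        'url': e.select_one('.titleline a').get('href'),
        # record None instead of a misparsed token when the row has no score
        'votes': score.get_text().split()[0] if score else None
    })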
Related
I need to scrape data from a link. The required data is hidden within another link on the webpage.
Something similar to the webpage I am working on is this link - College List. Say I need to get data about each college listed on this site. First, I land on this page. Then I extract all relevant links on this page and on subsequent pages. Then I go to each link and get the relevant data.
I am not able to get the desired list of links, and I don't know how to go to the next page and do the same thing.
What I have tried so far is -
import requests
import lxml.html as lh
url = 'https://www.indiacollegeshub.com/colleges/'
page = requests.get(url)
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//*[@id="ContentPlaceHolder1_pnl_collegelist"]/ul/li[1]')
col = []
for t in tr_elements[0]:  # starting from 2nd row for column headers
    name = t.text_content()
    col.append(name)
print(col)  # gives me string values and not links
print(tr_elements[0].xpath('//a/@href'))  # Gives me all links. I need links within the div [@id="ContentPlaceHolder1_pnl_collegelist"] only.
I am not able to get the required list of links page by page. I think there are some 2K+ pages on this site.
Thanks in advance.
I used Beautiful Soup to scrape the site.
import requests
from bs4 import BeautifulSoup
data = []
url = f"https://www.indiacollegeshub.com/colleges/"
print(f"Scraping {url} ...")
page = requests.get(url)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
table = soup.find("div", class_="clg-lists").find("ul")
assert table, "table not found"
for item in table.find_all("a", href=True):
    data.append({
        "link": item["href"],
        "text": item.text.strip(),
    })
print(data)
# Returns in format
# [
# {
# "link": "https://www.indiacollegeshub.com/colleges/iifa-lancaster-degree-college-bangalore.aspx",
# "text": "IIFA Lancaster Degree College, Bangalore\n#5,14/2,Suvarna Jyothi Layout,\xa0Jnanabharathi post, Nagadevanahalli, Bangalore - Karnataka, India\nPhone : +91 9845984211,+91 7349241005, Landline No:08023241999",
# }, {
# "link": "https://www.indiacollegeshub.com/colleges/iifa-multimedia-bangalore.aspx",
# "text": "IIFA Multimedia, Bangalore\n#262 80 feet main road srinivasa nagar,\xa09th main corner, Bangalore - Karnataka, India\nPhone : 080 48659176, +91 7349241004,+91 9845006824",
# },
# ...
# ]
Outputs:
Scraping https://www.indiacollegeshub.com/colleges/ ...
[{'link': 'https://www.indiacollegeshub.com/colleges/iifa-lancaster-degree-college-bangalore.aspx', 'text': 'IIFA Lancaster Degree College, Bangalore\n#5,14/2,Suvarna Jyothi Layout,\xa0Jnanabharathi post, Nagadevanahalli, Bangalore - Karnataka, India\nPhone : +91 9845984211,+91 7349241005, Landline No:08023241999'}, {'link': 'https://www.indiacollegeshub.com/colleges/iifa-multimedia-bangalore.aspx', 'text': 'IIFA Multimedia, Bangalore\n#262 80 feet main road srinivasa nagar,\xa09th main corner, Bangalore - Karnataka, India\nPhone : 080 48659176, +91 7349241004,+91 9845006824'}, {'link': 'https://www.indiacollegeshub.com/colleges/3-berhampur-college-berhampur.aspx', 'text': '+3 Berhampur College, Berhampur\nRaj Berhampur Berhampur - Orissa, India\nPhone : N/A'}, {'link': 'https://www.indiacollegeshub.com/colleges/3-panchayat-samiti-mahavidyalaya-balangir.aspx', 'text': '+3 Panchayat Samiti Mahavidyalaya, Balangir\nGyana Vihar Deogaon Balangir - Orissa, India\nPhone : N/A'}, {'link': 'https://www.indiacollegeshub.com/colleges/21st-century-international-school-trust-sivagangai.aspx', 'text': '21St Century International School Trust, Sivagangai\nRani Velu Nachiar Nagar, Kangirangal Post, Sivagangai Sivagangai - Tamil Nadu, India\nPhone : 04575 - 244930'}, {'link': 'https://www.indiacollegeshub.com/colleges/3dfx-animation-school-kochi.aspx', 'text': '3DFX Animation School, Kochi\nNear MP Office, Kattuparambil Towers, Old Market Road, Angamaly, Kochi - Kerala, India\nPhone : 91 0484 2455799'}, {'link': 'https://www.indiacollegeshub.com/colleges/4-g-fire-college-sonipat.aspx', 'text': '4 G Fire College, Sonipat\nK.C. Plaza, 1ST FLOWER,Above Eye Q Hospita , Atlas Road, Near State Bank of India, Sonipat - Haryana, India\nPhone : 9466769467, 7206220706'}, {'link': 'https://www.indiacollegeshub.com/colleges/5-gates-multimedia-solutions-indore.aspx', 'text': '5 Gates Multimedia Solutions, Indore\n102 Krtgya Tower, 8, Janki Nagar, A.b. Road, Indore - Madhya Pradesh, India\nPhone : (0731) 2400656'}, {'link': 'https://www.indiacollegeshub.com/colleges/a-a-arts-and-science-college-chennai.aspx', 'text': 'A A Arts And Science College, Chennai\n42/1, Srinivasan Nagar, Iind Street, Koyambedu Chennai - Tamil Nadu, India\nPhone : 044-28553109, 28526202'}, {'link': 'https://www.indiacollegeshub.com/colleges/a-a-govt-arts-college-attur-salem.aspx', 'text': 'A A Govt Arts College Attur, Salem\nSalem Salem - Tamil Nadu, India\nPhone : N/A'}]
If you want to scrape all of the 2k+ pages, you need to use multithreading to scrape the site faster. I used the code from this article. Don't forget to replace the variable NUM_THREADS with your number of threads. I highly recommend writing the output to a file while the program is scraping.
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
# REPLACE WITH YOUR NUMBER OF THREADS
NUM_THREADS = 8
links = [f"https://www.indiacollegeshub.com/colleges/page-{index}.aspx" for index in range(1, 2480)] # 1 - 2479
data = []
def scrape(url):
    print(f"Scraping {url} ...")
    page = requests.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find("div", class_="clg-lists").find("ul")
    assert table, "table not found"
    for item in table.find_all("a", href=True):
        data.append({
            "link": item["href"],
            "text": item.text.strip(),
        })
start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    executor.map(scrape, links)
total_time = time.time() - start_time
print(total_time)
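As recommended above, it is safer to write results out while the program is scraping, so a crash halfway through 2,479 pages does not lose everything. A minimal sketch of incremental CSV writing (the file name colleges.csv and the helper save_rows are my own naming, not from the original code); each scrape call would collect its rows into a local list and call save_rows(rows) at the end instead of appending to the shared data list:

import csv
import threading

write_lock = threading.Lock()
outfile = open("colleges.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(outfile)
writer.writerow(["link", "text"])  # header row

def save_rows(rows):
    # csv writers are not thread-safe, so serialize writes with a lock
    with write_lock:
        writer.writerows([r["link"], r["text"]] for r in rows)
        outfile.flush()  # keep results on disk even if the run dies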
import requests
from bs4 import BeautifulSoup
url = "https://boulder.noshdelivery.co/restaurants"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
restaurant_wrapper = soup.find(class_="dd_rest_list")
restaurants = restaurant_wrapper.find_all(class_="menu__vendor-name")
restaurant_street_address = restaurant_wrapper.find("span", itemprop="streetAddress")
restaurant_address_locality = restaurant_wrapper.find("span", itemprop="addressLocality")
def extract_restaurant_data(restaurant):
    restaurant_title = restaurant_wrapper.find(class_="menu__vendor-name")
    return {
        "title": restaurant_title.text.strip(),
        "streetAddress": restaurant_street_address.text.strip(),
        "addressLocality": restaurant_address_locality.text.strip()
    }
results = [extract_restaurant_data(restaurant) for restaurant in restaurants]
print(results)
I would like to know why this code prints exactly the same info each time and does not iterate over the list of restaurants.
My output is this:
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.', 'addressLocality': 'Boulder'},
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.', 'addressLocality': 'Boulder'}........
The info is the same. I do not know why my code does not iterate over the different names from the list of "restaurants".
You only did one find for the data. Do a find_all for each section and then zip them together!
restaurant_details = zip(
    restaurant_wrapper.find_all(class_="menu__vendor-name"),
    restaurant_wrapper.find_all("span", itemprop="streetAddress"),
    restaurant_wrapper.find_all("span", itemprop="addressLocality"),
)
results = [
    {
        "title": title.text.strip(),
        "streetAddress": street_address.text.strip(),
        "addressLocality": address_locality.text.strip()
    }
    for title, street_address, address_locality in restaurant_details
]
print(results)
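One caveat with the zip approach (my note, not from the original answer): zip silently truncates to the shortest input, so if any restaurant card is missing one of the three fields, every later entry gets paired with the wrong address. A quick guard before zipping:

titles = restaurant_wrapper.find_all(class_="menu__vendor-name")
streets = restaurant_wrapper.find_all("span", itemprop="streetAddress")
localities = restaurant_wrapper.find_all("span", itemprop="addressLocality")

# zip stops at the shortest list; make sure the three lists actually line up
assert len(titles) == len(streets) == len(localities), "field lists misaligned"

Iterating per restaurant card, as the next answer does, avoids this failure mode entirely.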
Your function has restaurant_wrapper.find(class_="menu__vendor-name") written in it, so each time it runs it returns only the first occurrence of the class menu__vendor-name.
To print a new restaurant's details in each iteration, you have to access each web element individually.
The code below would allow you to get the details for all restaurants.
restwords = restaurant_wrapper.find_all("div", {"class": "dd_restwords"})
def extract_restaurant_data(restaurant):
    title = restaurant.find("div", {"class": "menu__vendor-name"}).text
    streetAddress = restaurant.find("span", {"itemprop": "streetAddress"}).text
    addressLocality = restaurant.find("span", {"itemprop": "addressLocality"}).text
    rest_data = {
        "title": title,
        "streetAddress": streetAddress,
        "addressLocality": addressLocality
    }
    return rest_data

for restaurant in restwords:
    print(extract_restaurant_data(restaurant))
I'm trying to scrape some data from a site called laced.co.uk, and I'm a tad confused about what's going wrong. I'm new to this, so try to explain it simply (if possible, please!). Here is my code:
from bs4 import BeautifulSoup
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text=" £195 ")
print(prices)
Thank you! (The price at the time of posting was £195; it showed as the size 7 buy now price on the page.)
The price is loaded within a <script> tag on the page:
<script>
  typeof(dataLayer) != "undefined" && dataLayer.push({
    'event': 'eec.productDetailImpression',
    'page': {
      'ecomm_prodid': 'DD1391-100'
    },
    'ecommerce': {
      'detail': {
        'actionField': {'list': 'Product Page'},
        'products': [{
          'name': 'Nike Dunk Low Retro Black White',
          'id': 'DD1391-100',
          'price': '195.0',
          'brand': 'Nike',
          'category': 'Dunk, Dunk Low, Mens Nike Dunks',
          'variant': 'White',
          'list': 'Product Page',
          'dimension1': '195.0',
          'dimension2': '7',
          'dimension3': '190',
          'dimension4': '332'
        }]
      }
    }
  });
</script>
You can use a regular expression pattern to search for the price. Note, there's no need for BeautifulSoup:
import re
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
price = re.search(r"'price': '(.*?)',", result.text).group(1)
print(f"£ {price}")
I need to parse info from a site. On this site there are two blocks, "Today" and "Yesterday", and both have the same class name, standard-box standard-list.
How can I parse only the first block (under "Today") without extracting the information from "Yesterday", if they both contain the same class name?
Here is my code:
import requests
from bs4 import BeautifulSoup
url_news = "https://www.123.org/"
response = requests.get(url_news)
soup = BeautifulSoup(response.content, "html.parser")
items = soup.findAll("div", class_="standard-box standard-list")
news_info = []
for item in items:
    news_info.append({
        "title": item.find("div", class_="newstext").text,
        "link": item.find("a", class_="newsline article").get("href")
    })
When running your provided code, I don't get an output for items. However, you said that you do, so:
If you only want to get the data under "Today", you can use .find() instead of .find_all(), since .find() will only return the first found tag -- which is "Today" and not the other tags.
So, instead of:
items = soup.findAll("div", class_="standard-box standard-list")
Use:
items = soup.find("div", class_="standard-box standard-list")
Additionally, to find the link, I needed to access the attribute using tag-name[attribute]. Here is working code:
news_info = []
items = soup.find("div", class_="standard-box standard-list")
for item in items:
    news_info.append(
        {"title": item.find("div", class_="newstext").text, "link": item["href"]}
    )
print(news_info)
Output:
[{'title': 'NIP crack top 3 ranking for the first time in 5 years', 'link': 'https://www.hltv.org/news/32545/nip-crack-top-3-ranking-for-the-first-time-in-5-years'}, {'title': 'Fessor joins Astralis Talent', 'link': 'https://www.hltv.org/news/32544/fessor-joins-astralis-talent'}, {'title': 'Grashog joins AGO', 'link': 'https://www.hltv.org/news/32542/grashog-joins-ago'}, {'title': 'ISSAA parts ways with Eternal Fire', 'link': 'https://www.hltv.org/news/32543/issaa-parts-ways-with-eternal-fire'}, {'title': 'BLAST Premier Fall Showdown Fantasy live', 'link': 'https://www.hltv.org/news/32541/blast-premier-fall-showdown-fantasy-live'}, {'title': 'FURIA win IEM Fall NA, EG claim final Major Legends spot', 'link': 'https://www.hltv.org/news/32540/furia-win-iem-fall-na-eg-claim-final-major-legends-spot'}]
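An equivalent way to scope the parse to the first block is to take the container with select_one and then select only the links inside it. A sketch, assuming the class names shown in the question:

soup = BeautifulSoup(response.content, "html.parser")
today = soup.select_one("div.standard-box.standard-list")  # first block only
news_info = [
    {
        "title": a.find("div", class_="newstext").text,
        "link": a["href"],
    }
    for a in today.select("a.newsline.article")  # links inside that block
]
print(news_info)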
How can I skip one iteration of a spider if the webpage contains certain data?
Page titles:
There are several page titles across the pages; I have left out the other data (dates, likes) here.
page 1 title: 'We like cats' # this title is valid
page 2 title: 'This title contains WORD X...' # this title is not valid (skip it)
page 3 title: 'Best ideas' # this title is valid
Code:
from scrapy.spider import CrawlSpider

class Carflix(CrawlSpider):
    name = 'carflix'
    allowed_domains = ['sitex.com']
    start_urls = ['http://sitex.com/page-1.html',
                  'http://sitex.com/page-2.html',
                  'http://sitex.com/page-3.html']

    def parse(self, response):
        date = response.xpath('//div[@class="date"]/text()').extract_first()
        pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
        if 'WORD X' in pagetitle:
            # what do I need to do here to skip adding the data when the page title contains 'WORD X'?
        likes = response.xpath('//div[@class="likes"]/text()').extract_first()
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
The result should be:
[{
    'pagetitle': 'We like cats',
    'date': '01/01/2019',
    'likes': 200
},
{
    'pagetitle': 'Best ideas',
    'date': '02/01/2019',
    'likes': 100
}]
Just yield your results under your specified condition:
def parse(self, response):
    date = response.xpath('//div[@class="date"]/text()').extract_first()
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    likes = response.xpath('//div[@class="likes"]/text()').extract_first()
    if 'WORD X' not in pagetitle:
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
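An equivalent formulation makes the skip explicit with an early return (my sketch; the extra pagetitle check also guards against pages where the title XPath matches nothing, since 'WORD X' in None would raise a TypeError):

def parse(self, response):
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    # Skip this page entirely if the title is missing or contains the blocked word
    if not pagetitle or 'WORD X' in pagetitle:
        return
    yield {
        'pagetitle': pagetitle,
        'date': response.xpath('//div[@class="date"]/text()').extract_first(),
        'likes': response.xpath('//div[@class="likes"]/text()').extract_first(),
    }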