Extracting Most Read Titles with BS4 - python

I want to extract the titles in the Most Read section of a news page. This is what I have so far, but I'm getting all the titles. I just want the ones in the Most Read section.
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
for story_heading in soup.find_all(class_="views-field views-field-title"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())

You need to limit your search scope to the div container that holds the Most Read articles.
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
# narrow the search to the Most Read container first
most_read_soup = soup.find_all('div', {'class': 'view-id-most_read'})[0]
for story_heading in most_read_soup.find_all(class_="views-field views-field-title"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())
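As a side note, soup.find('div', {'class': 'view-id-most_read'}) is equivalent to find_all(...)[0], but it returns None instead of raising an IndexError when the container is missing, which is easier to guard against.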

You can use a CSS selector to get the specific tags from the Most Read div:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
# select only anchors inside the Most Read pane
css = "div.pane-most-read-panel-pane-1 a"
links = [a.text.strip() for a in soup.select(css)]
Which will give you:
[u'Michigan in Color: Anotha One', u'Migos trio ends 2016 SpringFest with Hill Auditorium concert', u'Migos dabs their way to a seminal moment for Ann Arbor hip hop', u'Best of Ann Arbor 2016', u'Best of Ann Arbor 2016: Full List']
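If you also want the article URLs rather than just the titles, the same selector hands you the anchor tags, so you can read href too and resolve relative links with urljoin (a sketch building on the selector above; the pane class comes from that answer):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = 'https://www.michigandaily.com/section/opinion'
soup = BeautifulSoup(requests.get(base_url).text, "html5lib")
for a in soup.select("div.pane-most-read-panel-pane-1 a"):
    # urljoin makes relative hrefs absolute against the page URL
    print(a.text.strip(), urljoin(base_url, a.get('href', '')))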

Related

Is there any way to fix bad currency read?

I would like to scrape prices of items on the Steam Community Market with Python and BeautifulSoup, but when I scrape one, the currency comes back as Swedish crowns. Is there any way to change that to EUR?
from bs4 import BeautifulSoup
import requests

html_text = requests.get('https://steamcommunity.com/market/listings/730/USP-S%20%7C%20Blueprint%20%28Factory%20New%29').text
soup = BeautifulSoup(html_text, 'lxml')
skin_name = soup.find('span', class_='market_listing_item_name').text
sm = soup.find('span', class_='market_listing_price').text.replace(' ', 'sm ')
print(skin_name, sm)
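No answer was posted here, but one commonly used workaround (an assumption on my part, not something from this thread) is to skip the rendered listing page, whose currency depends on the region Steam detects, and query Steam's priceoverview JSON endpoint instead, since it accepts an explicit currency code; community currency tables list 3 as EUR. A minimal sketch under those assumptions:

import requests

# Assumption: parameters follow community documentation of the
# priceoverview endpoint; currency=3 is assumed to mean EUR.
params = {
    'appid': 730,
    'currency': 3,
    'market_hash_name': 'USP-S | Blueprint (Factory New)',
}
r = requests.get('https://steamcommunity.com/market/priceoverview/', params=params)
data = r.json()
print(data.get('lowest_price'), data.get('median_price'))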

How can I find each link as a string from an HTML page with Beautiful Soup? (findAll is not working well for this website)

I want to collect all the Dota 2 hero names, which appear as links on https://dota2.gamepedia.com/Abaddon/Counters, into a list.
Here is my test code:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://dota2.gamepedia.com/Abaddon/Counters'
uClient = uReq(my_url)
page_html = uClient.read()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "mw-parser-output"})
print(containers)
But after printing the containers variable, almost all of the information under this div tag is missing; only some comments show up. I have no idea why this is happening. After this step my plan is to scrape the links, but first I need to get the whole content into containers.
This script will print all hero names:
import requests
from bs4 import BeautifulSoup
url = 'https://dota2.gamepedia.com/Abaddon/Counters'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
all_heros = [a.text for a in soup.select('b > a')]
#print them:
print(*all_heros, sep='\n')
Prints:
Ancient Apparition
Axe
Brewmaster
Doom
Lina
Lion
Mars
Outworld Devourer
Silencer
Shadow Demon
Death Prophet
Mirana
Bane
Batrider
Beastmaster
Chen
Techies
Bloodseeker
Necrophos
Nyx Assassin
Storm Spirit
Phantom Assassin
Io
Axe
Legion Commander
Centaur Warrunner
Oracle
EDIT (to scrape the categories, you can use the .find_previous() function):
import requests
from bs4 import BeautifulSoup

url = 'https://dota2.gamepedia.com/Abaddon/Counters'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

data = {}
for hero in soup.select('b > a'):
    # group each hero under the nearest preceding section headline
    data.setdefault(hero.find_previous(class_='mw-headline').text, []).append(hero.text)

# print them:
from pprint import pprint
pprint(data)
Prints:
{'Bad against...': ['Ancient Apparition',
'Axe',
'Brewmaster',
'Doom',
'Lina',
'Lion',
'Mars',
'Outworld Devourer',
'Silencer',
'Shadow Demon'],
'Good against...': ['Death Prophet',
'Mirana',
'Bane',
'Batrider',
'Beastmaster',
'Chen',
'Techies',
'Bloodseeker',
'Necrophos',
'Nyx Assassin'],
'Works well with...': ['Storm Spirit',
'Phantom Assassin',
'Io',
'Axe',
'Legion Commander',
'Centaur Warrunner',
'Oracle']}
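find_previous() works here because BeautifulSoup searches backwards through the document from each hero link, so every link gets attributed to the nearest section headline (Bad against..., Good against..., Works well with...) that precedes it.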

BeautifulSoup and scraping hrefs isn't working

Again I am having trouble scraping hrefs with BeautifulSoup. I have a list of pages that I am scraping, and I get the data, but I can't seem to get the hrefs even with approaches that work in other scripts.
So here is the code and my data will be below that:
import requests
from bs4 import BeautifulSoup

with open('states_names.csv', 'r') as reader:
    states = [state.strip().replace(' ', '-') for state in reader]

url = 'https://www.hauntedplaces.org/state/'
for state in states:
    page = requests.get(url + state)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.findAll('div', class_='description')
    # When I try to add .get('href') I get a traceback error.
    # Am I trying to scrape the href too early?
    h_page = soup.findAll('h3')
<h3>Gaines Ridge Dinner Club</h3>
<h3>Purifoy-Lipscomb House</h3>
<h3>Kate Shepard House Bed and Breakfast</h3>
<h3>Cedarhurst Mansion</h3>
<h3>Crybaby Bridge</h3>
<h3>Gaineswood Plantation</h3>
<h3>Mountain View Hospital</h3>
findAll('div', class_='description') returns the div tags themselves, which have no href attribute; the href lives on the a tags inside them. Selecting those directly works perfectly:
from bs4 import BeautifulSoup
import requests

url = 'https://www.hauntedplaces.org/state/Alabama'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
for link in soup.select('div.description a'):
    print(link['href'])
Try this:
soup = BeautifulSoup(page.content, 'html.parser')
list0 = []
possible_links = soup.find_all('a')
for link in possible_links:
    if link.has_attr('href'):
        print(link.attrs['href'])
        list0.append(link.attrs['href'])
print(list0)
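Putting the working selector back into the original loop over states could look like this (a sketch; it assumes the states_names.csv file from the question and that state pages live under /state/<name>):

import requests
from bs4 import BeautifulSoup

with open('states_names.csv', 'r') as reader:
    states = [state.strip().replace(' ', '-') for state in reader]

base_url = 'https://www.hauntedplaces.org/state/'
all_links = []
for state in states:
    page = requests.get(base_url + state)
    soup = BeautifulSoup(page.text, 'html.parser')
    # each place card is a div.description; the link is its inner <a>
    for a in soup.select('div.description a[href]'):
        all_links.append(a['href'])
print(all_links)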

Scraping the news (Python 3.6, BeautifulSoup)

I want to scrape spiegel.de/schlagzeilen to get all the news items shown below the dates (today, yesterday, two days ago).
<div class="schlagzeilen-content schlagzeilen-overview">
contains what I want, I think, but there is one problem left:
print(data)
prints the data I need, but it also comes with a bunch of phrases I don't want (names of the embedded modules, parts of the HTML/CSS, and so on). So I chose:
for item in data:
    print(item.text)
This one has very pretty output(!), but now I am missing the article URL, which is important to have. Is there anybody who can help me out? Here is my code:
from bs4 import BeautifulSoup
import requests

website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for item in data:
    print(item.text)
You could use a CSS selector to find all article links:
from bs4 import BeautifulSoup
import requests

website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
# data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
links = soup.select('div.schlagzeilen-content a')
for item in links:
    print(item.text, website + item['href'])
Some output:
Bayern: Sechs Tote in Gartenlaube - keine Hinweise auf Gewaltverbrechen http://spiegel.de/schlagzeilen/panorama/justiz/tote-in-gartenlaube-keine-hinweise-auf-gewaltverbrechen-a-1132268.html
Starbucks, Tesla, GE: Trumps Einreiseverbot beunruhigt US-Konzerne http://spiegel.de/schlagzeilen/wirtschaft/soziales/donald-trump-und-das-einreiseverbot-us-konzerne-zeigen-sich-besorgt-a-1132262.html
...
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
div = soup.find("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for a in div.find_all('a', title=True):
    # build an absolute URL from the relative href
    link = urljoin(website, a.get('href'))
    print(a.text, a.find_next_sibling('span').text)
    print(link)
out:
Südafrika: Dutzende Patienten sterben nach Verlegung (Panorama, 13:09)
http://spiegel.de/panorama/gesellschaft/suedafrika-verlegung-in-privatkliniken-dutzende-patienten-gestorben-a-1132677.html
Trumps Stotterstart: Ein Präsident, so unbeliebt wie keiner zuvor (Politik, 12:59)
http://spiegel.de/politik/ausland/donald-trump-als-us-praesident-so-unbeliebt-wie-kein-vorgaenger-a-1132554.html
Kontrolle von Gefährdern: Kabinett beschließt elektronische Fußfessel (Politik, 12:33)
The tag you need is the a tag, and its sibling span contains the section and time, e.g. (Netzwelt, 12:23), so just use find_all with the a tag as an anchor. And if you want the full path of the URL, use urljoin.
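For comparison, naive concatenation (as in the first answer) keeps the /schlagzeilen path in the result, while urljoin resolves the relative href against the site root:

from urllib.parse import urljoin

website = 'http://spiegel.de/schlagzeilen'
href = '/panorama/gesellschaft/suedafrika-verlegung-in-privatkliniken-dutzende-patienten-gestorben-a-1132677.html'
print(website + href)          # http://spiegel.de/schlagzeilen/panorama/... (wrong)
print(urljoin(website, href))  # http://spiegel.de/panorama/... (correct)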

Scraped Span Returns None Get_Text() Python Beautiful Soup

I've scraped links to cars and now wish to follow the links and scrape some data about each car, but my code returns an empty array (None if I print individually). Any ideas how to fix this?
import bs4 as bs
import urllib.request

source = urllib.request.urlopen('http://www.25thstauto.com/inventory.aspx?cursort=asc&pagesize=500').read()
soup = bs.BeautifulSoup(source, 'lxml')
price = []
car = soup.select('a[id*=ctl00_cphBody_inv1_rptInventoryNew]')
for a in car:
    source2 = urllib.request.urlopen('http://www.25thstauto.com/' + a.get('href')).read()
    price.append(soup.find('span', {'id': 'ctl00_cphBody_inv1_lblPrice'}))
print(price)
import bs4 as bs
import urllib.request

source = urllib.request.urlopen('http://www.25thstauto.com/inventory.aspx?cursort=asc&pagesize=500').read()
soup = bs.BeautifulSoup(source, 'lxml')
price = []
car = soup.select('a[id*=ctl00_cphBody_inv1_rptInventoryNew]')
for a in car:
    source2 = urllib.request.urlopen('http://www.25thstauto.com/' + a.get('href')).read()
    # make a new soup based on the link, do not reuse the old soup
    soup2 = bs.BeautifulSoup(source2, 'lxml')
    price.append(soup2.find('span', {'id': 'ctl00_cphBody_inv1_lblPrice'}))
    print(price)
out:
[<span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>]
[<span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>, <span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>]
[<span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>, <span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>, <span id="ctl00_cphBody_inv1_lblPrice">$2,995</span>]
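If you want just the price strings rather than the whole span tags, take .text from each span and skip listings where the span is missing (a small variation on the answer above):

import bs4 as bs
import urllib.request

source = urllib.request.urlopen('http://www.25thstauto.com/inventory.aspx?cursort=asc&pagesize=500').read()
soup = bs.BeautifulSoup(source, 'lxml')
prices = []
for a in soup.select('a[id*=ctl00_cphBody_inv1_rptInventoryNew]'):
    source2 = urllib.request.urlopen('http://www.25thstauto.com/' + a.get('href')).read()
    soup2 = bs.BeautifulSoup(source2, 'lxml')
    span = soup2.find('span', {'id': 'ctl00_cphBody_inv1_lblPrice'})
    if span:  # some listings may have no price span
        prices.append(span.text.strip())
print(prices)  # e.g. ['$2,995', ...]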
