Scraping the news (Python 3.6, BeautifulSoup) - python

I want to scrape spiegel.de/schlagzeilen to get all the news items shown below the dates (today, yesterday, two days ago).
<div class="schlagzeilen-content schlagzeilen-overview">
contains what I want, I think, but there is one problem left:
print(data)
prints the data I need, but also a bunch of text I don't want (names of embedded modules, bits of HTML/CSS, and so on)
So I chose
for item in data:
print(item.text)
This gives very clean output(!), but now I'm missing the article URLs, which I need. Can anybody help me out? Here is my code:
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for item in data:
    print(item.text)

You could use a CSS selector to find all article links:
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
# data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
links = soup.select('div.schlagzeilen-content a')
for item in links:
    print(item.text, website + item['href'])
Some output:
Bayern: Sechs Tote in Gartenlaube - keine Hinweise auf Gewaltverbrechen http://spiegel.de/schlagzeilen/panorama/justiz/tote-in-gartenlaube-keine-hinweise-auf-gewaltverbrechen-a-1132268.html
Starbucks, Tesla, GE: Trumps Einreiseverbot beunruhigt US-Konzerne http://spiegel.de/schlagzeilen/wirtschaft/soziales/donald-trump-und-das-einreiseverbot-us-konzerne-zeigen-sich-besorgt-a-1132262.html
...

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup(r.content, "lxml")
div = soup.find("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for a in div.find_all('a', title=True):
    link = urljoin(website, a.get('href'))
    print(a.text, a.find_next_sibling('span').text)
    print(link)
out:
Südafrika: Dutzende Patienten sterben nach Verlegung (Panorama, 13:09)
http://spiegel.de/panorama/gesellschaft/suedafrika-verlegung-in-privatkliniken-dutzende-patienten-gestorben-a-1132677.html
Trumps Stotterstart: Ein Präsident, so unbeliebt wie keiner zuvor (Politik, 12:59)
http://spiegel.de/politik/ausland/donald-trump-als-us-praesident-so-unbeliebt-wie-kein-vorgaenger-a-1132554.html
Kontrolle von Gefährdern: Kabinett beschließt elektronische Fußfessel (Politik, 12:33)
The tag you need is the a tag, and its sibling span contains the section and time, e.g. (Netzwelt, 12:23), so just use find_all with the a tag as an anchor.
And if you want the full URL, use urljoin (see the sketch below).
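For reference, urljoin resolves a relative href against the page URL; a minimal sketch (the article path is a made-up example):
from urllib.parse import urljoin
website = 'http://spiegel.de/schlagzeilen'
# hypothetical relative href, as it appears in the page source
href = '/politik/ausland/example-article-a-1132554.html'
print(urljoin(website, href))
# -> http://spiegel.de/politik/ausland/example-article-a-1132554.html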

Related

How can I extract href links from an a tag within a table th using BeautifulSoup

I am trying to create a list of all football teams/links from any one of a number of tables within the base URL: https://fbref.com/en/comps/10/stats/Championship-Stats
I would then use the link from the href to scrape each individual team's data. The href is embedded within the th tag as per below
th scope="row" class="left " data-stat="squad">Barnsley</th
a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley</a
The following code gives me a list of the 'a' tags
page = "https://fbref.com/en/comps/10/Championship-Stats"
pageTree = requests.get(page)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
Teams = pageSoup.find_all("th", {"class": "left"})
Output(for each class of 'left'):
th class="left" data-stat="squad" scope="row">
a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley,
I have tried the guidance from a previous Stack question (Extract links after th in beautifulsoup)
However, the following code based on that thread produces this error:
AttributeError: 'NoneType' object has no attribute 'find_parent'
def import_TeamList():
    BASE_URL = "https://fbref.com/en/comps/10/Championship-Stats"
    r = requests.get(BASE_URL)
    soup = BeautifulSoup(r.text, 'lxml')
    team_list = []
    team_tr = soup.find('a', {'data-stat': 'squad'}).find_parent('tr')
    for tr in reels_tr.find_next_siblings('tr'):
        if tr.find('a').text != 'squad':
            break
        midi_list.append(BASE_URL + tr.find('a')['href'])
    return TeamList
Here is a version using CSS selectors, which I find simpler than most other methods.
import requests
from bs4 import BeautifulSoup
url = 'https://fbref.com/en/comps/10/stats/Championship-Stats'
data = requests.get(url).text
soup = BeautifulSoup(data, 'html.parser')
links = soup.select('th a')
urls = [link['href'] for link in links]
print(urls)
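Note that the hrefs returned here are relative (e.g. /en/squads/...). If you need absolute URLs, you can join each one against the site root; a minimal sketch of the same idea:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://fbref.com/en/comps/10/stats/Championship-Stats'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
# join each relative href against the site root to get a full URL
urls = [urljoin('https://fbref.com', a['href']) for a in soup.select('th a')]
print(urls)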
Is this what you're looking for?
import requests
from bs4 import BeautifulSoup as BS
from lxml import etree
with requests.Session() as session:
    r = session.get('https://fbref.com/en/comps/10/stats/Championship-Stats')
    r.raise_for_status()
    dom = etree.HTML(str(BS(r.text, 'lxml')))
    for a in dom.xpath('//th[@class="left"]/a'):
        print(a.attrib['href'])

BeautifulSoup find() takes no keyword arguments error

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
query_txt = input("Enter a search term to crawl: ")
path = r"C:\Temp\chromedriver_240\chromedriver.exe"  # raw string so the backslashes aren't treated as escapes
driver = webdriver.Chrome(path)
driver.get("https://www.naver.com")
time.sleep(2)
driver.find_element_by_id("query").send_keys(query_txt)
driver.find_element_by_id("search_btn").click()
driver.find_element_by_link_text("블로그 더보기").click()  # "View more blogs" - must match the on-page link text
full_html = driver.page_source
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.find('ul', id='elThumbnailResultArea')
print(content_list)
content = content_list.find('a','sh_blog_title _sp_each_url _sp_each_title' ).get_text()
print(content)
for i in content_list:
    con = i.find('a', class_='sh_blog_title _sp_each_url _sp_each_title').get_text()
    print(con)
    print('\n')
I typed this code while following an online course, but inside the loop it always raises an error:
con = i.find('a', class_='sh_blog_title _sp_each_url _sp_each_title').get_text()
This line fails with 'find() takes no keyword arguments'.
The problem is that iterating over content_list yields NavigableString children as well as Tags, and on a string .find() is the built-in str.find(), which takes no keyword arguments. Also, .find() only returns one tag (if there is any); use .find_all() to get all <a> tags:
import requests
from bs4 import BeautifulSoup
url = 'https://search.naver.com/search.naver?query=tree&where=post&sm=tab_nmr&nso='
full_html = requests.get(url).content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.find_all('a', class_='sh_blog_title _sp_each_url _sp_each_title' )
for i in content_list:
    print(i.text)
    print('\n')
Prints:
[2017/공학설계 입문] Romantic Tree
장충동/Banyan Tree Club & Spa/Club Members Restaurant
2020-06-27 Joshua Tree National Park Camping(조슈아트리...
[결혼준비/D-102] 웨딩밴드 '누니주얼리 - like a tree'
Book Club - Magic Tree House # 1 : Dinosaur Before Dark...
비밀 정원, 조슈아 트리 국립공원(Joshua Tree National Park)
그뤼너씨 TEA TREE 티트리 라인 3종리뷰
Number of Nodes in the Sub-Tree With the Same Label
태국의 100년 넘은 Giant tree
[부산 기장 카페] 오션뷰 뷰맛집카페 : 씨앤트리 sea&tree
Use .find('a', attrs={"class": "<class name>"}) instead. Reference: the BeautifulSoup docs.
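To see why the error message mentions keyword arguments, here is a minimal sketch with made-up markup that reproduces and then avoids the problem:
from bs4 import BeautifulSoup
# made-up markup; the whitespace between tags becomes NavigableString children
html = '<ul id="items">\n<li><a class="title">one</a></li>\n</ul>'
soup = BeautifulSoup(html, 'html.parser')
ul = soup.find('ul', id='items')
# Iterating a Tag yields NavigableStrings too; on a string, .find() is
# str.find(), so keyword arguments raise "find() takes no keyword arguments":
# for child in ul:
#     child.find('a', class_='title')
# Iterate over Tags only instead:
for li in ul.find_all('li'):
    print(li.find('a', attrs={'class': 'title'}).get_text())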
These two links will definitely help you.
Understand the Find() function in Beautiful Soup
Find on beautiful soup in loop returns TypeError

Python Scraping empty tag

I have a problem with scraping some element from a page:
https://tuning-tec.com/mercedes_w164_ml_mklasa_0507_black_led_seq_lpmed0-5789i
code:
import requests
from bs4 import BeautifulSoup
URL="https://tuning-tec.com/mercedes_w164_ml_mklasa_0507_black_led_seq_lpmed0-5789i"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
title=soup.find(class_="product_cart_title").text
price=soup.find(class_="icon_main_block_price_a")
number=soup.find(class_="product_cart_info").findAll('tr')[1].findAll('td')[1]
description=soup.find(id="tab_a")
print(description)
The problem is when I try to get tab_a, because
<div align="left" class="product_cart_info" id="charlong_id">
</div>
is empty. How can I get its content?
I think it's loaded via JS; maybe there is some delay while the page loads?
As stated in the comments, the info is loaded via JavaScript, so BeautifulSoup doesn't see it. But if you look at the Chrome/Firefox network tab, you can see where the page is making requests:
import re
import requests
from bs4 import BeautifulSoup
url = 'https://tuning-tec.com/mercedes_w164_ml_mklasa_0507_black_led_seq_lpmed0-5789i'
ajax_url = 'https://tuning-tec.com/_template/_show_normal/_show_charlong.php?itemId={}'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(soup.select_one('.product_cart_title').get_text(strip=True))
print(soup.select_one('.icon_main_block_price_a').get_text(strip=True))
print(soup.select_one('td:contains("Symbol") ~ td').get_text(strip=True))
item_id = re.findall(r"ajax_update_stat\('(\d+)'\)", soup.text)[0]
soup2 = BeautifulSoup(requests.get(ajax_url.format(item_id)).content, 'html.parser')
print()
# just print some info:
for tr in soup2.select('tr'):
    print(re.sub(r' {2,}', ' ', tr.select_one('td').get_text(strip=True, separator=' ')))
Prints:
MERCEDES W164 ML M-KLASA 05-07 BLACK LED SEQ
1788.62 PLN
LPMED0
PL
Opis
Lampy soczewkowe ze światłem pozycyjnym LED. Z dynamicznym kierunkowskazem. 100% nowe, w komplecie (lewa i prawa). Homologacja: norma E13 - dopuszczone do ruchu.
Szczegóły
Światła pozycyjne: DIODY Kierunkowskaz: DIODY Światła mijania: H9 w zestawie Światła drogowe: H1 w zestawie Regulacja: elektryczna (silniczek znajduje się w komplecie).
LED TUBE LIGHT Dynamic Turn Signal >>
I made a little change for the description; I don't know if it's right, have a look at the following code:
import re
import requests
from bs4 import BeautifulSoup
url = 'https://tuning-tec.com/mercedes_w164_ml_mklasa_0507_black_led_seq_lpmed0-5789i'
ajax_url = 'https://tuning-tec.com/_template/_show_normal/_show_charlong.php?itemId={}'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
def unwrapElements(soup, elementsToFind):
    elements = soup.find_all(elementsToFind)
    for element in elements:
        element.unwrap()
print(soup.select_one('.product_cart_title').get_text(strip=True))
print(soup.select_one('.icon_main_block_price_a').get_text(strip=True))
print(soup.select_one('td:contains("Symbol") ~ td').get_text(strip=True))
item_id = re.findall(r"ajax_update_stat\('(\d+)'\)", soup.text)[0]
soup2 = BeautifulSoup(requests.get(ajax_url.format(item_id)).content, 'html.parser')
description=soup2.findAll('tr')[2].findAll('td')[1]
description.append(soup2.findAll('tr')[4].findAll('td')[1])
unwrapElements(description, "td")
unwrapElements(description, "font")
unwrapElements(description, "span")
print(description)
I just need these description elements in English. Will that be OK?
And anyway, thanks for the help!!
Only one thing: I don't know why it didn't remove all of them.

BeautifulSoup and scraping href's isn't working

Again I am having trouble scraping hrefs with BeautifulSoup. I have a list of pages that I am scraping, and I get the data, but I can't seem to get the hrefs even with various snippets that work in other scripts.
So here is the code and my data will be below that:
import requests
from bs4 import BeautifulSoup
with open('states_names.csv', 'r') as reader:
    states = [state.strip().replace(' ', '-') for state in reader]
url = 'https://www.hauntedplaces.org/state/'  # base URL; the state name is appended below
for state in states:
    page = requests.get(url + state)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.findAll('div', class_='description')
    # When I try to add .get('href') I get a traceback error. Am I trying to scrape the href too early?
    h_page = soup.findAll('h3')
<h3>Gaines Ridge Dinner Club</h3>
<h3>Purifoy-Lipscomb House</h3>
<h3>Kate Shepard House Bed and Breakfast</h3>
<h3>Cedarhurst Mansion</h3>
<h3>Crybaby Bridge</h3>
<h3>Gaineswood Plantation</h3>
<h3>Mountain View Hospital</h3>
This works perfectly:
from bs4 import BeautifulSoup
import requests
url = 'https://www.hauntedplaces.org/state/Alabama'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
for link in soup.select('div.description a'):
    print(link['href'])
Try this:
soup = BeautifulSoup(page.content, 'html.parser')
list0 = []
possible_links = soup.find_all('a')
for link in possible_links:
    if link.has_attr('href'):
        print(link.attrs['href'])
        list0.append(link.attrs['href'])
print(list0)
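As a side note, BeautifulSoup can filter on attribute presence directly with href=True, which folds the has_attr check into the query; a minimal sketch of the same idea:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://www.hauntedplaces.org/state/Alabama')
soup = BeautifulSoup(page.text, 'html.parser')
# href=True matches only <a> tags that actually carry an href attribute
hrefs = [a['href'] for a in soup.find_all('a', href=True)]
print(hrefs)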

Extracting Most Read Titles with BS4

I want to extract the titles in the Most Read section of a news page. This is what I have so far, but I'm getting all the titles. I just want the ones in the Most Read section.
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
for story_heading in soup.find_all(class_="views-field views-field-title"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())
You need to limit your scope to only the div container for the most read articles.
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
most_read_soup = soup.find_all('div', {'class': 'view-id-most_read'})[0]
for story_heading in most_read_soup.find_all(class_="views-field views-field-title"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())
You can use a css selector to get the specific tags from the top most read div:
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.michigandaily.com/section/opinion'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html5lib")
css = "div.pane-most-read-panel-pane-1 a"
links = [a.text.strip() for a in soup.select(css)]
Which will give you:
[u'Michigan in Color: Anotha One', u'Migos trio ends 2016 SpringFest with Hill Auditorium concert', u'Migos dabs their way to a seminal moment for Ann Arbor hip hop', u'Best of Ann Arbor 2016', u'Best of Ann Arbor 2016: Full List']
