Web scraping YouTube pages - Python

I'm trying to scrape a YouTube channel name from a channel link, but I get this error:
title = response.find_all('div', class_= "style-scope ytd-channel-name")
AttributeError: 'Response' object has no attribute 'find_all'
Link to site: https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg
Code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg'
response = requests.get(url)
title = response.find_all('div', class_= "style-scope ytd-channel-name")
soup = BeautifulSoup(title.text, 'lxml')
print(soup)
Thank you!
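The error is unrelated to YouTube itself: find_all is a BeautifulSoup method, not a method of the requests Response object, so the response has to be parsed before it can be searched. A minimal sketch of the corrected order:
import requests
from bs4 import BeautifulSoup

url = 'https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg'
response = requests.get(url)

# Parse the raw HTML first, then search the parsed tree.
soup = BeautifulSoup(response.text, 'lxml')
divs = soup.find_all('div', class_='style-scope ytd-channel-name')
print(divs)
Note that this alone may still print an empty list, because YouTube builds much of the page with JavaScript; the answers below work around that.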

You can use requests_html to render the page's JavaScript first, then parse the rendered HTML with BeautifulSoup:
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs # importing BeautifulSoup
video_url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"
# init an HTML Session
session = HTMLSession()
# get the html content
response = session.get(video_url)
# render the page's JavaScript
response.html.render(sleep=1)
# create bs object to parse HTML
soup = bs(response.html.html, "html.parser")
name = soup.find('yt-formatted-string', class_='style-scope ytd-channel-name')
print(name.text)
Output:
TheTekkitRealm
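As a lighter-weight alternative, the channel name can often be read from the og:title meta tag, which YouTube serves in the static HTML (a sketch, assuming the Open Graph tags are still present; no JavaScript rendering needed):
import requests
from bs4 import BeautifulSoup

url = 'https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# og:title carries the channel name in the initial HTML.
meta = soup.find('meta', property='og:title')
if meta is not None:
    print(meta['content'])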

The following code fetches the page and prints its first div:
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
print(soup.div)
Which tag is returned can be changed by accessing a different attribute on soup (e.g. soup.title returns the first <title> tag).
The BeautifulSoup documentation is also worth a look:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#
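For instance, attribute access is shorthand for find with the same tag name; a quick sketch:
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

# These two lines are equivalent: both return the first <title> tag.
print(soup.title)
print(soup.find("title"))

# .string pulls the tag's text content.
print(soup.title.string)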

Related

BeautifulSoup find() function returning None because I am not getting correct html info

I am trying to scrape an Amazon product page and print out the productTitle. Code below:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import smtplib
def extract():
    headers = {my_info}
    url = "link"  # this link is to a webpage on Amazon
    req = requests.get(url, headers=headers)
    soup1 = BeautifulSoup(req.content, "html.parser")  # get soup1 from website
    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')  # prettify it in soup 2
    print(soup2)
    # When I inspect element on the website, it shows a <span> tag with id=productTitle
    # title = soup2.find('span', id='productTitle').get_text()  # find() is returning None
    print(soup2.prettify())
I was expecting the HTML content I inspected directly on the website to be the same as in soup2, but for some reason it's not, which is causing find() to return None. How come the HTML is not the same? Any help would be appreciated.
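A common cause (not confirmed in the post) is that Amazon serves different HTML to scripted clients, often a bot-check page, so the DevTools view and the fetched document genuinely differ. A quick way to check what you actually received (a hedged sketch; the URL and headers are placeholders, as in the question):
import requests
from bs4 import BeautifulSoup

# Hypothetical URL and headers for illustration; the original post redacts both.
url = "https://www.amazon.com/dp/EXAMPLE"
headers = {"User-Agent": "Mozilla/5.0"}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")

# If Amazon served a bot-check page, the <title> usually makes that obvious.
print(soup.title)
print("captcha" in req.text.lower())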

Webscraping Google news page: getting AttributeError: 'NoneType' object has no attribute 'find_all'

I'm trying to web-scrape the Google News page for a personal project and retrieve the article headlines to print out onto another page. I've been searching for any typos or mistakes but I'm not sure why my element keeps returning as "None" when I try to print it.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.google.com/search?q=beyond+meat&rlz=1C1CHBF_enUS898US898&sxsrf=ALeKk00IH9jp1Kz5-LSyi7FUB4rd6--_hw:1624935518812&source=lnms&tbm=nws&sa=X&ved=2ahUKEwicqIbD7LvxAhVWo54KHXgRA9oQ_AUoAXoECAEQAw&biw=1536&bih=754'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find('div', id='rso')  # grabs everything in column
article_results = results.find_all('div', class_='yr3B8d KWQBje')  # grabs divs surrounding each article
for article_result in article_results:
    headliner = article_result.find('div', class_='JheGif nDgy9d')  # grabs article header div for every article
    if headliner is None:
        continue
    headliner_text = headliner.text.strip()
    print(headliner_text)
import requests
from bs4 import BeautifulSoup
URL = 'https://www.google.com/search?q=beyond+meat&rlz=1C1CHBF_enUS898US898&sxsrf=ALeKk00IH9jp1Kz5-LSyi7FUB4rd6--_hw:1624935518812&source=lnms&tbm=nws&sa=X&ved=2ahUKEwicqIbD7LvxAhVWo54KHXgRA9oQ_AUoAXoECAEQAw&biw=1536&bih=754'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
headers = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
for h in headers:
    print(h.text)
Refer to the output; is this what you are expecting? (Google serves different, simpler HTML to requests than to a full browser, which is why the class names you copied from DevTools come back as None.)
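If you want to see why the DevTools view differed, compare what Google returns with and without a browser-like User-Agent header (a hedged sketch; the header value is illustrative and the search URL is a trimmed version of the one in the question):
import requests

# Trimmed version of the question's news-search URL.
URL = 'https://www.google.com/search?q=beyond+meat&tbm=nws'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

plain = requests.get(URL)
browser_like = requests.get(URL, headers=headers)

# The two responses typically differ in size and structure, which is why
# class names copied from DevTools often don't exist in the plain fetch.
print(len(plain.text), len(browser_like.text))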

'NoneType' object is not callable in Beautiful Soup 4

I'm new-ish to Python and started experimenting with Beautiful Soup 4. I tried writing code that would get all the links on one page and then repeat the process with those links until I have an entire website parsed.
import bs4 as bs
import urllib.request as url

links_unclean = []
links_clean = []

soup = bs.BeautifulSoup(url.urlopen('https://pythonprogramming.net/parsememcparseface/').read(), 'html.parser')

for url in soup.find_all('a'):
    print(url.get('href'))
    links_unclean.append(url.get('href'))

for link in links_unclean:
    if link[:8] == 'https://':
        links_clean.append(link)

print(links_clean)

while True:
    for link in links_clean:
        soup = bs.BeautifulSoup(url.urlopen(link).read(), 'html.parser')
        for url in soup.find_all('a'):
            print(url.get('href'))
            links_unclean.append(url.get('href'))
        for link in links_unclean:
            if link[:8] == 'https://':
                links_clean.append(link)
        links_clean = list(dict.fromkeys(links_clean))
        input()
But I'm now getting this error:
TypeError: 'NoneType' object is not callable
  line 20, in <module>
    soup = bs.BeautifulSoup(url.urlopen(link).read(), 'html.parser')
Can you please help?
Be careful when importing modules under an alias. Here, the url alias from line 2 (import urllib.request as url) is overwritten by the loop variable in for url in soup.find_all('a'), so the next call to url.urlopen(link) looks up urlopen on a Tag instead of the module, gets None, and calling it raises 'NoneType' object is not callable.
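A minimal fix is to stop reusing the name, for example by renaming the loop variable (a sketch under that assumption):
import bs4 as bs
import urllib.request as url

links_unclean = []
links_clean = []

soup = bs.BeautifulSoup(url.urlopen('https://pythonprogramming.net/parsememcparseface/').read(), 'html.parser')

# A distinct loop variable keeps the 'url' module alias intact.
for anchor in soup.find_all('a'):
    links_unclean.append(anchor.get('href'))

for link in links_unclean:
    if link and link.startswith('https://'):
        links_clean.append(link)

print(links_clean)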
Here is a shorter solution that will also give back only URLs containing https as part of the href attribute:
from bs4 import BeautifulSoup
from urllib.request import urlopen
content = urlopen('https://pythonprogramming.net/parsememcparseface/')
soup = BeautifulSoup(content, "html.parser")
base = soup.find('body')
for link in BeautifulSoup(str(base), "html.parser").findAll("a"):
    if 'href' in link.attrs:
        if 'https' in link['href']:
            print(link['href'])
However, this paints an incomplete picture, as not all links are captured because of HTML errors on the page. May I also recommend the following alternative, which is very simple and works flawlessly in your scenario (note: you will need the Requests-HTML package):
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://pythonprogramming.net/parsememcparseface/')

for link in r.html.absolute_links:
    print(link)
This will output all URLs, including both those that reference other URLs on the same domain and those that are external websites.
I would consider using an [attribute^=value] CSS selector, where the ^ operator specifies that the href attribute must begin with https; you will then only have valid protocols. Also, use a set comprehension to ensure no duplicates, and a Session to re-use the connection.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

final = []

with requests.Session() as s:
    r = s.get('https://pythonprogramming.net/parsememcparseface/')
    soup = bs(r.content, 'lxml')
    httpsLinks = {item['href'] for item in soup.select('[href^=https]')}
    for link in httpsLinks:
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        newHttpsLinks = [item['href'] for item in soup.select('[href^=https]')]
        final.append(newHttpsLinks)

tidyList = list({item for sublist in final for item in sublist})
df = pd.DataFrame(tidyList)
print(df)

Python3 : BeautifulSoup4 not returning expected value

I'm currently trying to scrape some data from a website using BS4 under Python 3.6.4, but the value returned is not what I am expecting:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.content
soup = BeautifulSoup(page, "html5lib")
price = soup.find("div", {"class" : "fieldPrice sizeC"}).text
print(price)
I should get "39 900 €" but the code returns "47 880 â¬".
NB: Even without JS, the data should be "39 900 €".
Thanks for your help!
The encoding declaration is wrong on this page so BeautifulSoup gets told to use the wrong encoding. You can force it to use the correct encoding like this:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.content
soup = BeautifulSoup(page.decode('utf-8','ignore'), "html5lib")
price = soup.find("div", {"class": "fieldPrice sizeC"}).text
print(price)
Outputs:
49 070 €
Instead of page.content, use page.text.
Ex:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.text
soup = BeautifulSoup(page, "html.parser")
price = soup.find("div", {"class" : "fieldPrice sizeC"}).text
print(price)
.text automatically decodes the content from the server, using the encoding requests infers from the response headers.
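To see the difference, compare the two attributes directly: .content is raw bytes, while .text is those bytes decoded with response.encoding (a minimal sketch):
import requests

r = requests.get("https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc")

print(type(r.content))      # <class 'bytes'>: the raw payload
print(type(r.text))         # <class 'str'>: the decoded text
print(r.encoding)           # encoding inferred from the response headers
print(r.apparent_encoding)  # encoding detected from the bytes themselves

# If the declared encoding is wrong, overriding it fixes r.text:
r.encoding = r.apparent_encoding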

BeautifulSoup : Fetched all the links on a webpage how to navigate through them without selenium?

So I'm trying to write a mediocre script to download subtitles from one particular website, as y'all can see. I'm a newbie to BeautifulSoup; so far I have a list of all the href values after a search query (GET). So how do I navigate further after getting all the links?
Here's the code:
import requests
from bs4 import BeautifulSoup
usearch = input("Movie Name? : ")
url = "https://www.yifysubtitles.com/search?q="+usearch
print(url)
resp = requests.get(url)
soup = BeautifulSoup(resp.content, 'lxml')
for link in soup.find_all('a'):
    dictn = link.get('href')
    print(dictn)
You need to use resp.text instead of resp.content
Try this to get the search results.
import requests
from bs4 import BeautifulSoup
base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"
resp = requests.get(search_url)
soup = BeautifulSoup(resp.text, 'lxml')
for media in soup.find_all("div", {"class": "media-body"}):
    print(base_url_f + media.find('a')['href'])
out: https://www.yifysubtitles.com/movie-imdb/tt2527336
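To answer the "navigate further" part: once you have each movie page URL, request it with the same pattern and parse the new soup. A hedged sketch (what to extract on the movie page is an assumption about the site's markup, not something the answer above confirms):
import requests
from bs4 import BeautifulSoup

base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"

resp = requests.get(search_url)
soup = BeautifulSoup(resp.text, 'lxml')

for media in soup.find_all("div", {"class": "media-body"}):
    movie_url = base_url_f + media.find('a')['href']

    # Navigate further: fetch the movie page itself and parse it in turn.
    movie_resp = requests.get(movie_url)
    movie_soup = BeautifulSoup(movie_resp.text, 'lxml')

    # Printing the title just proves the navigation worked; replace this
    # with whatever the movie page's markup calls for.
    print(movie_soup.title.string if movie_soup.title else movie_url)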
