I ran similar code on another website and it works, but on opensubtitles.org I'm having a problem! I don't know why it is not able to recognize the href (the link I need) and the titles.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.opensubtitles.org/it/search/sublanguageid-eng/searchonlymovies-on/genre-horror/movielanguage-english/moviecountry-usa/subformat-srt/hd-on/offset-4040'

def scarica_pagina(link):
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    cnt = 0
    for film in soup.find(id="search_results").find_all("td"):
        cnt = cnt + 1
        link = film.find("a")["href"]
        title = film.find("a").text
        # genres = film.find("i").text
        print(link)

if __name__ == '__main__':
    scarica_pagina(URL)
All you need is to follow the DOM correctly:
1. First, choose the table with id = 'search_results'.
2. Find all td tags with class name 'sb_star_odd' or 'sb_star_even'.
3. Use find_all('a')[0]['href'] for the link you want.
4. Use find_all('a')[0].text for the title you want.
import re

import requests
from bs4 import BeautifulSoup

URL = 'https://www.opensubtitles.org/it/search/sublanguageid-eng/searchonlymovies-on/genre-horror/movielanguage-english/moviecountry-usa/subformat-srt/hd-on/offset-4040'

def scarica_pagina(link):
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    cnt = 0
    # matches both 'sb_star_odd' and 'sb_star_even'
    for film in soup.find(id="search_results").find_all('td', class_=re.compile('^sb_star')):
        cnt = cnt + 1
        link = film.find_all('a')[0]['href']
        title = film.find_all('a')[0].text
        print(link)

if __name__ == '__main__':
    scarica_pagina(URL)
You were using find instead of find_all; that is what caused your problem.
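For reference, a minimal sketch of the difference (on a made-up HTML snippet, not the real opensubtitles markup): find returns only the first match, while find_all returns every match as a list:

from bs4 import BeautifulSoup

# A tiny made-up snippet just to illustrate find vs find_all.
html = '<div class="row"><a href="/movie/1">First</a><a href="/movie/2">Second</a></div>'
soup = BeautifulSoup(html, 'html.parser')

row = soup.find('div')        # find returns the FIRST matching tag (or None)
links = row.find_all('a')     # find_all returns a LIST of every match

print(row.find('a')['href'])  # /movie/1 - same as row.find_all('a')[0]['href']
print(len(links))             # 2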
I'm trying to web-scrape the Google News page for a personal project and retrieve the article headlines to print out onto another page. I've been searching for any typos or mistakes but I'm not sure why my element keeps returning as "None" when I try to print it.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?q=beyond+meat&rlz=1C1CHBF_enUS898US898&sxsrf=ALeKk00IH9jp1Kz5-LSyi7FUB4rd6--_hw:1624935518812&source=lnms&tbm=nws&sa=X&ved=2ahUKEwicqIbD7LvxAhVWo54KHXgRA9oQ_AUoAXoECAEQAw&biw=1536&bih=754'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

results = soup.find('div', id='rso')  # grabs everything in column
article_results = results.find_all('div', class_='yr3B8d KWQBje')  # grabs divs surrounding each article
for article_result in article_results:
    headliner = article_result.find('div', class_='JheGif nDgy9d')  # grabs article header div for every article
    if headliner is None:
        continue
    headliner_text = headliner.text.strip()
    print(headliner_text)
import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?q=beyond+meat&rlz=1C1CHBF_enUS898US898&sxsrf=ALeKk00IH9jp1Kz5-LSyi7FUB4rd6--_hw:1624935518812&source=lnms&tbm=nws&sa=X&ved=2ahUKEwicqIbD7LvxAhVWo54KHXgRA9oQ_AUoAXoECAEQAw&biw=1536&bih=754'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

headers = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
for h in headers:
    print(h.text)
Refer to the output. Is this what you are expecting?
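Note that Google serves different markup depending on the client, so these obfuscated class names are fragile. A hedged sketch that sends an explicit User-Agent (the header value is just an example) and fails loudly when the layout changes:

import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?q=beyond+meat&tbm=nws'

# Example desktop User-Agent; Google's markup (and therefore the class
# names) changes with the client, so the selector below is an assumption.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

results = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
if not results:
    # The class names rotate; re-inspect the served HTML if this triggers.
    print('No headlines found - the markup may have changed.')
for div in results:
    print(div.text)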
I am new to Python and web scraping. I wrote some code for scraping quotes and the corresponding author name from https://www.brainyquote.com/topics/inspirational-quotes and ended up with no result. Here is the code I used:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome(executable_path=r"C:\Users\Sandheep\Desktop\chromedriver.exe")

product = []
prices = []

driver.get("https://www.brainyquote.com/topics/inspirational-quotes")
content = driver.page_source
soup = BeautifulSoup(content, "lxml")
for a in soup.findAll("a", href=True, attrs={"class": "clearfix"}):
    quote = a.find("a", href=True, attrs={"title": "view quote"}).text
    author = a.find("a", href=True, attrs={"class": "bq-aut"}).text
    product.append(quote)
    prices.append(author)

print(product)
print(prices)
I can't figure out where I need to edit to get the result.
Thanks in advance!
As I understand it, the site has this information in the alt attribute of its images, with the quote and author separated by ' - '.
So you need to iterate over soup.find_all('img'); the function to fetch the results may look like:
def fetch_quotes(soup):
    for img in soup.find_all('img'):
        try:
            quote, author = img['alt'].split(' - ')
        except (KeyError, ValueError):
            # image without alt text, or alt not in 'quote - author' form
            pass
        else:
            yield {'quote': quote, 'author': author}
Then use it like: print(list(fetch_quotes(soup)))
Also note that you can often replace Selenium with plain requests, e.g.:
import requests
from bs4 import BeautifulSoup

content = requests.get("https://www.brainyquote.com/topics/inspirational-quotes").content
soup = BeautifulSoup(content, "lxml")
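Putting the two pieces together, a minimal end-to-end sketch (same URL, assuming the ' - ' convention in the alt text still holds):

import requests
from bs4 import BeautifulSoup

def fetch_quotes(soup):
    # Same generator as above: quotes live in the images' alt attributes.
    for img in soup.find_all('img'):
        try:
            quote, author = img['alt'].split(' - ')
        except (KeyError, ValueError):
            pass  # image without alt text, or alt not in 'quote - author' form
        else:
            yield {'quote': quote, 'author': author}

content = requests.get("https://www.brainyquote.com/topics/inspirational-quotes").content
soup = BeautifulSoup(content, "lxml")
for item in fetch_quotes(soup):
    print(item['quote'], '-', item['author'])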
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome(executable_path=r"ChromeDriver path")
driver.get("https://www.brainyquote.com/topics/inspirational-quotes")
content = driver.page_source
soup = BeautifulSoup(content, "lxml")

root_tag = ["div", {"class": "m-brick grid-item boxy bqQt r-width"}]
quote_author = ["a", {"title": "view author"}]

quote = []
author = []
all_data = soup.findAll(root_tag[0], root_tag[1])
for div in all_data:
    try:
        quote.append(div.find_all("a", {"title": "view quote"})[1].text)
        author.append(div.find(quote_author[0], quote_author[1]).text)
    except (AttributeError, IndexError):  # block without the expected links
        continue
The output will be:
for i in range(len(author)):
    print(quote[i])
    print(author[i])
    break
Start by doing what's necessary; then do what's possible; and suddenly you are doing the impossible.
Francis of Assisi
I'm using BeautifulSoup in Python to scrape a website.
While addrs and a_earths are crawled fine, points = soup.select('.addr_point') at the end cannot be crawled, and I don't know the cause (the dashed red box in the image of the webpage below).
Following is the code block I'm using:
import urllib.request
from bs4 import BeautifulSoup
import re

url = 'http://www.dooinauction.com/auction/ca_list.php'
req = urllib.request.Request(url)
html = urllib.request.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')

tots = soup.select('div.title_left font')  # total
tot = int(re.findall(r'\d+', tots[0].text)[0])
print(f'total : {tot}건')

url = f'http://www.dooinauction.com/auction/ca_list.php?total_record={tot}&search_fm_off=1&search_fm_off=1&start=0'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

addrs = soup.select('.addr')  # crawling OK
a_earths = soup.select('.list_class.bold')  # crawling OK
points = soup.select('.addr_point')  # crawling NO
print()
[Image of webpage]
I browsed your website and it seems that I can't see the addr_point section in the served HTML either. I think maybe this is the reason.
Screenshot:
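If the markers are injected by JavaScript after the page loads, they will never appear in the HTML that urllib receives; a quick hedged check (same URL as above):

import urllib.request

# Fetch the raw HTML exactly as the scraper sees it and search for the class
# name as a plain substring - if it is absent here, no selector can find it.
url = 'http://www.dooinauction.com/auction/ca_list.php'
html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
print('addr_point' in html)  # False would suggest JS-rendered content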
I need to scrape a website to obtain some information like the films' titles and the relative links. My code runs correctly but it stops at the first line of the website. This is my code; thank you in advance for your help, and sorry if this is not a smart question, but I'm a novice.
import requests
from bs4 import BeautifulSoup

URL = 'http://www.simplyscripts.com/genre/horror-scripts.html'

def scarica_pagina(URL):
    page = requests.get(URL)
    html = page.text
    soup = BeautifulSoup(html, 'lxml')
    films = soup.find_all("div", {"id": "movie_wide"})
    for film in films:
        link = film.find('p').find("a").attrs['href']
        title = film.find('p').find("a").text.strip('>')
        print(link)
        print(title)
Try the below way. I've slightly modified your script to serve the purpose and make it look better. Let me know if you encounter any further issues:
import requests
from bs4 import BeautifulSoup

URL = 'http://www.simplyscripts.com/genre/horror-scripts.html'

def scarica_pagina(link):
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    for film in soup.find(id="movie_wide").find_all("p"):
        link = film.find("a")['href']
        title = film.find("a").text
        print(link, title)

if __name__ == '__main__':
    scarica_pagina(URL)
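One caveat worth hedging against: if any p inside the container lacks an a tag, film.find("a") returns None and the ['href'] lookup raises a TypeError. A defensive variant of the loop body might look like:

    for film in soup.find(id="movie_wide").find_all("p"):
        a = film.find("a")
        if a is None or not a.has_attr('href'):  # skip paragraphs without a link
            continue
        print(a['href'], a.text)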
So I'm trying to write a mediocre script to download subtitles from one particular website, as y'all can see. I'm a newbie to BeautifulSoup; so far I have a list of all the "href"s after a search query (GET). So how do I navigate further after getting all the links?
Here's the code:
import requests
from bs4 import BeautifulSoup

usearch = input("Movie Name? : ")
url = "https://www.yifysubtitles.com/search?q=" + usearch
print(url)

resp = requests.get(url)
soup = BeautifulSoup(resp.content, 'lxml')
for link in soup.find_all('a'):
    dictn = link.get('href')
    print(dictn)
You need to use resp.text instead of resp.content.
Try this to get the search results:
import requests
from bs4 import BeautifulSoup

base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"
resp = requests.get(search_url)
soup = BeautifulSoup(resp.text, 'lxml')

for media in soup.find_all("div", {"class": "media-body"}):
    print(base_url_f + media.find('a')['href'])
out: https://www.yifysubtitles.com/movie-imdb/tt2527336
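To navigate further (the part the question actually asks about), you can request each result page in turn and parse it the same way. A hedged sketch - the '/subtitles/' prefix used to spot subtitle links is an assumption, so inspect the real markup before relying on it:

import requests
from bs4 import BeautifulSoup

base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"
resp = requests.get(search_url)
soup = BeautifulSoup(resp.text, 'lxml')

for media in soup.find_all("div", {"class": "media-body"}):
    movie_url = base_url_f + media.find('a')['href']
    # Follow each search result and parse the movie page the same way.
    movie_soup = BeautifulSoup(requests.get(movie_url).text, 'lxml')
    # Hypothetical second-level extraction: list every link on the movie
    # page that looks like a subtitle page.
    for a in movie_soup.find_all('a', href=True):
        if a['href'].startswith('/subtitles/'):
            print(base_url_f + a['href'])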