I have a problem scraping some basic info about movies from imdb.com. I want my program to get the title and description of a movie from a given URL. The title part works, but I can't figure out how to get the description. Here's my code:
import requests
from bs4 import BeautifulSoup as bs

def get_data(url):
    r = requests.get(url, headers={'Accept-Language': 'en-US,en;q=0.5'})
    if not r or 'https://www.imdb.com/title' not in url:
        return print('Invalid movie page!')
    return r.content

if __name__ == '__main__':
    # print('Input the URL:')
    # link = input()
    link = 'https://www.imdb.com/title/tt0111161'
    data = get_data(link)
    soup = bs(data, 'html.parser')
    title = ' '.join(soup.find('h1').text.split()[:-1])
    desc = soup.find('p', {'data-testid': "plot", 'class': "GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
    movie_info = {'title': title, 'description': desc}
    print(movie_info)
When I run it I get an error:
Exception has occurred: AttributeError
'NoneType' object has no attribute 'text'
File "movie-scraper.py", line 18, in <module>
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
How do I access the description properly?
To get the plot summary, change the selector to find the element with class="plot_summary":
import requests
from bs4 import BeautifulSoup as bs

def get_data(url):
    r = requests.get(url, headers={"Accept-Language": "en-US,en;q=0.5"})
    if not r or "https://www.imdb.com/title" not in url:
        return print("Invalid movie page!")
    return r.content

if __name__ == "__main__":
    link = "https://www.imdb.com/title/tt0111161"
    data = get_data(link)
    soup = bs(data, "html.parser")
    title = " ".join(soup.find("h1").text.split()[:-1])
    desc = soup.find("div", class_="plot_summary").get_text(strip=True)  # <-- change this to find class="plot_summary"
    movie_info = {"title": title, "description": desc}
    print(movie_info)
Prints:
{'title': 'The Shawshank Redemption', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.Director:Frank DarabontWriters:Stephen King(short story "Rita Hayworth and Shawshank Redemption"),Frank Darabont(screenplay)Stars:Tim Robbins,Morgan Freeman,Bob Gunton|See full cast & crew»'}
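If IMDb serves you the newer React layout instead, the auto-generated class names (like GenresAndPlot__Plot-cum89p-8 kmrpno) change between releases, so it is safer to match only on the data-testid attribute from your original selector and guard against a missing tag. A minimal sketch, assuming data-testid="plot" is still present in the page you receive:

# match on the stable data-testid attribute instead of the generated class names
desc_tag = soup.find('p', {'data-testid': 'plot'})
desc = desc_tag.get_text(strip=True) if desc_tag else 'N/A'  # guard against a missing tag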
This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/
This is my code. As you can see, I don't know how to fill in both of the {} placeholders in the url variable, because the URLs for the four categories I want to scrape differ, especially the one for Services. The comments above the url variable show the query strings for the four categories when clicked. Appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
    #cate=Food+%26+Beverage
    #cate=Electronics+%26+Technology
    #cate=Fashion+%26+Accessories
    #cate=Services
    url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1,14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            for link in soup.find_all('div', class_='entry-content'):
                try:
                    shops = soup.find_all('div', class_="col-9")
                    names = soup.find_all('tr', class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ','')
                        desc = k.text.replace(' ','')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request and avoid managing escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'
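As a rough end-to-end sketch, the params approach can be combined with the selectors from your original code (the col-9 and clickable class names are taken from your snippet and are assumptions about the page's current markup):

from bs4 import BeautifulSoup
import requests

url = "https://www.jurongpoint.com.sg/store-directory/"
# category names as written in the question; the site may expect e.g. "Services"
cate = ["Service", "Food & Beverage", "Fashion & Accessories", "Electronics & Technology"]

for cat in cate:
    for page in range(1, 14):
        payload = {'level': '', 'cate': cat, 'page': page}
        resp = requests.get(url, params=payload)
        soup = BeautifulSoup(resp.text, 'html.parser')
        names = soup.find_all('tr', class_="clickable")
        shops = soup.find_all('div', class_="col-9")
        for n, k in zip(names, shops):
            # the second <td> of each clickable row holds the shop name
            print(n.find_all('td')[1].get_text(strip=True))
            print(k.get_text(strip=True))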
I need to find all the title data and stock numbers. I wrote code that works well with a single item, but when I use the find_all method it shows an error. Please have a look at my code and guide me on how to handle this. Thank you!
Here is my code:
import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import numpy as
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_detail_page(soup):
    title = soup.find_all('div', class_="vehicle-location-name mts bold", id=False).text
    print(title)
    stock = soup.find_all('div', class_="text-lightgray", id=False).find('span').text
    print(stock)

def main():
    url = "https://www.superbrightleds.com/vehicle/2002-acura-cl-vehicle-led-lights?make=1&model=554&year=2002"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
Try:
def get_detail_page(soup):
    titles = soup.findAll('div', attrs={"class": "vehicle-location-name mts bold"})
    stocks = soup.findAll('div', attrs={"class": "text-lightgray"})
    title = [title.get_text() for title in titles if title]
    stock = [stock.get_text() for stock in stocks if stock and 'Stock #' in str(stock)]
    for idx in range(len(stock)):
        print(f'{title[idx]}\n\t{stock[idx]}')
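If the two lists can come back with different lengths (an assumption; some entries on the page may lack a stock number), indexing with range(len(stock)) can raise an IndexError. A safer sketch zips the lists instead:

def get_detail_page(soup):
    titles = soup.find_all('div', class_="vehicle-location-name mts bold")
    stocks = soup.find_all('div', class_="text-lightgray")
    names = [t.get_text(strip=True) for t in titles]
    numbers = [s.get_text(strip=True) for s in stocks if 'Stock #' in s.get_text()]
    # zip stops at the shorter list instead of raising an IndexError
    for name, number in zip(names, numbers):
        print(f'{name}\n\t{number}')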
I read all the relevant previous topics on this matter and I've learned a lot (especially about the difference between lxml and html.parser). Anyway, after I changed my BeautifulSoup call to parse the page as lxml, I still don't get the same result every time.
Most of the time I get:
name = soup.find('span', id="productTitle").text
AttributeError: 'NoneType' object has no attribute 'text'
But a few times I get the real name of the product.
What am I missing?
import requests
from bs4 import BeautifulSoup

def main():
    url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
    client = requests.get(url, headers={"User-Agent": "Defined"})
    try:
        client.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print("Error!!!!" + str(e))
    soup = BeautifulSoup(client.content, 'lxml')
    name = soup.find('span', id="productTitle").text
    title = name[161:len(name)-128]
    print("the title is: ", title)

if __name__ == '__main__':
    main()
Presumably you are getting different HTML back. Check the HTML itself. You can add a test for whether the variable is None before attempting to access .text:
import requests
from bs4 import BeautifulSoup

def main():
    url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
    client = requests.get(url, headers={"User-Agent": "Defined"})
    try:
        client.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print("Error!!!!" + str(e))
    soup = BeautifulSoup(client.content, 'lxml')
    name = soup.find('span', id="productTitle")
    if name is None:
        name = 'N/A'
        title = 'n/a'
    else:
        name = name.text
        title = name[161:len(name)-128]
    print("the title is: ", title)

if __name__ == '__main__':
    main()
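Since the intermittent None usually means Amazon returned a different page (for example, a bot check), a small retry helper can also smooth this out. A sketch with an arbitrary retry count and delay, not a guaranteed fix:

import time
import requests
from bs4 import BeautifulSoup

def fetch_title(url, attempts=3, delay=2):
    for _ in range(attempts):
        client = requests.get(url, headers={"User-Agent": "Defined"})
        soup = BeautifulSoup(client.content, 'lxml')
        name = soup.find('span', id="productTitle")
        if name is not None:
            return name.get_text(strip=True)
        time.sleep(delay)  # wait a moment before retrying in case the block is temporary
    return 'N/A'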
I'm trying to develop an application that can download videos and soundtracks from YouTube, but after a query like "harry potter" or "lord of the rings" it sometimes raises the TypeError shown in the linked screenshot.
TypeError Message
And here is my Python code:
from __future__ import unicode_literals
import requests
from bs4 import BeautifulSoup
import youtube_dl

def find_search_content(search):
    request = requests.get("https://www.youtube.com/results?search_query={}".format(search))
    content = request.content
    soup = BeautifulSoup(content, "html.parser")
    return soup

def find_page_content(search):
    request = requests.get("https://www.youtube.com/results?{}".format(search))
    content = request.content
    soup = BeautifulSoup(content, "html.parser")
    return soup

def find_video(soup, all_item, i=1):
    for element in soup.find_all('a', {"rel": "spf-prefetch"}):
        video_title = element.get('title')
        video_link = element.get('href')
        img_value = element.get('href').split("=")[1]
        img = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(img_value)
        all_item['{}'.format(i)] = {"title": video_title,
                                    "link": "https://www.youtube.com{}".format(video_link),
                                    'img': img}
        i += 1
    return all_item

def video_time(soup, all_item, i=1):
    for time in soup.find_all('span', {"class": "video-time"}):
        all_item.get('{}'.format(i))['time'] = time.text
        i += 1
    return all_item

def every_video(soup):
    all_item = {}
    find_video(soup, all_item, i=1)
    video_time(soup, all_item, i=1)
    return all_item
Thank you very much!
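One likely cause (an assumption, since the traceback screenshot is not reproduced here) is that video_time finds more video-time spans than find_video stored entries, so all_item.get('{}'.format(i)) returns None and the item assignment raises the TypeError. A defensive sketch of video_time:

def video_time(soup, all_item, i=1):
    for time in soup.find_all('span', {"class": "video-time"}):
        item = all_item.get('{}'.format(i))
        if item is not None:  # skip durations that have no matching video entry
            item['time'] = time.text
        i += 1
    return all_item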
I'm trying to get multiple headlines, links, and dates, but I'm only getting the first one. I'm not sure why BS4 won't fetch all the items... Is it a JavaScript problem?
from bs4 import BeautifulSoup
from urllib import urlopen

html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'), "lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select("div.region.region-content")
for d in div_sub:
    date = d.time.get_text()
    headline = d.h2.a.get_text()
    url = d.a["href"]
    print headline, url, date
What about using the following to capture all the articles, with their links, authors, and posting dates, from the main page? You could store this in a dictionary, or put it in a pandas DataFrame for easy manipulation (a DataFrame sketch follows the example output below).
from bs4 import BeautifulSoup
import requests

baseurl = 'http://www.fiercepharma.com'
response = requests.get(baseurl)
soup = BeautifulSoup(response.content)

cdict = {}
for group in soup.find_all('div', {'class' : 'card horizontal views-row'}):
    try:
        title = group.find('h2', {'class' : 'field-content list-title'}).text
        link = baseurl + group.find('h2', {'class' : 'field-content list-title'}).find('a', href=True)['href']
        author = group.find('span', {'class' : 'field-content'}).find('a').text
        time = group.find('span', {'class' : 'field-content'}).find('time').text
        content = group.find('p', {'class' : 'field-content card-text'}).text
        cdict[link] = {'title' : title, 'author' : author, 'time' : time, 'content' : content}
    except AttributeError as e:
        print('[-] Unable to parse {}'.format(e))

print(cdict)
#{'http://www.fiercepharma.com/manufacturing/lonza-bulks-up-5-5b-deal-for-capsugel': {'author': u'Eric Palmer',
# 'content': u'Swiss CDMO Lonza has pulled the trigger on a $5.5 billion deal to acquire the U.S.-based contract capsule and drug producer Capsugel to create another sizable\u2026',
# 'time': u'Dec 15, 2016 8:45am',
# 'title': u'Lonza bulks up with $5.5B deal for Capsugel'},
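As mentioned above, the same dictionary drops straight into pandas. A small sketch, assuming pandas is installed:

import pandas as pd

# each cdict key (the article link) becomes the index of one row
df = pd.DataFrame.from_dict(cdict, orient='index')
print(df[['title', 'author', 'time']].head())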
Both div.card.horizontal.views-row and .card.horizontal.views-row should work, @citra_amarillo. I ran this and it worked both ways:
from bs4 import BeautifulSoup
from urllib import urlopen

html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'), "lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select(".card.horizontal.views-row")
#div_sub = main_div.select("div.card.horizontal.views-row")
for d in div_sub:
    date = d.time.get_text()
    headline = d.h2.a.get_text()
    url = d.a["href"]
    print headline, url, date