How to scrape reviews to a dataframe - python

I would like to scrape the reviews from this page and save them as a data frame, but I am not capturing the star ratings and the review text as separate fields, just one blob of text. What did I do wrong?
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser").find_all("div", {"class": "reviews-item"})
# print(soup)
morele = [div.getText(strip=True) for div in soup]
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)

You are mostly there; just navigate further down the DOM and you can pull out each field separately.
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser")

# Each reviews-item holds the description and the star rating in separate divs
data = [
    {
        "text": ri.find("div", {"class": "rev-desc"}).getText(strip=True),
        "stars": ri.find("div", {"class": "rev-stars"}).getText(strip=True),
    }
    for ri in soup.find_all("div", {"class": "reviews-item"})
]
pd.DataFrame(data)
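One caveat: if any reviews-item lacks a rev-desc or rev-stars child, .find() returns None and .getText() raises AttributeError. A minimal defensive sketch, reusing the class names from the answer above (the empty-string fallback is an assumption, not part of the original answer):

import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser")

def safe_text(parent, cls, default=""):
    # Return the stripped text of the first matching child div, or a fallback
    # (the default value here is an assumption, not from the answer above)
    node = parent.find("div", {"class": cls})
    return node.getText(strip=True) if node is not None else default

data = [
    {"text": safe_text(ri, "rev-desc"), "stars": safe_text(ri, "rev-stars")}
    for ri in soup.find_all("div", {"class": "reviews-item"})
]
print(pd.DataFrame(data))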

Related

Having trouble extracting the URL from a website

So I want to extract the URL for each of the buttons on the sidebar, but I can't seem to get past the first one, and I don't know why or how to fix it. Unfortunately, this is for an assignment, so I can't import anything else.
This is the code I tried:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
sidebar = soup.find_all('div', {'class': 'side_categories'})
for a in sidebar:
    genre_url = a.find('a').get('href')
    print(genre_url)
I got
catalogue/category/books_1/index.html
I was expecting
catalogue/category/books_1/index.html
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
...
I used the following CSS selector to find all the genre a tags in the sidebar: .side_categories>ul>li>ul>li>a
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
genre_url_elems = soup.select(".side_categories>ul>li>ul>li>a")
genre_urls = [e['href'] for e in genre_url_elems]
for url in genre_urls:
    print(url)
Here's the output:
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
catalogue/category/books/philosophy_7/index.html
catalogue/category/books/romance_8/index.html
catalogue/category/books/womens-fiction_9/index.html
catalogue/category/books/fiction_10/index.html
catalogue/category/books/childrens_11/index.html
catalogue/category/books/religion_12/index.html
catalogue/category/books/nonfiction_13/index.html
catalogue/category/books/music_14/index.html
catalogue/category/books/default_15/index.html
catalogue/category/books/science-fiction_16/index.html
catalogue/category/books/sports-and-games_17/index.html
catalogue/category/books/add-a-comment_18/index.html
catalogue/category/books/fantasy_19/index.html
catalogue/category/books/new-adult_20/index.html
catalogue/category/books/young-adult_21/index.html
catalogue/category/books/science_22/index.html
catalogue/category/books/poetry_23/index.html
catalogue/category/books/paranormal_24/index.html
catalogue/category/books/art_25/index.html
catalogue/category/books/psychology_26/index.html
catalogue/category/books/autobiography_27/index.html
catalogue/category/books/parenting_28/index.html
catalogue/category/books/adult-fiction_29/index.html
catalogue/category/books/humor_30/index.html
catalogue/category/books/horror_31/index.html
catalogue/category/books/history_32/index.html
catalogue/category/books/food-and-drink_33/index.html
catalogue/category/books/christian-fiction_34/index.html
catalogue/category/books/business_35/index.html
catalogue/category/books/biography_36/index.html
catalogue/category/books/thriller_37/index.html
catalogue/category/books/contemporary_38/index.html
catalogue/category/books/spirituality_39/index.html
catalogue/category/books/academic_40/index.html
catalogue/category/books/self-help_41/index.html
catalogue/category/books/historical_42/index.html
catalogue/category/books/christian_43/index.html
catalogue/category/books/suspense_44/index.html
catalogue/category/books/short-stories_45/index.html
catalogue/category/books/novels_46/index.html
catalogue/category/books/health_47/index.html
catalogue/category/books/politics_48/index.html
catalogue/category/books/cultural_49/index.html
catalogue/category/books/erotica_50/index.html
catalogue/category/books/crime_51/index.html
For more, read about 'CSS selectors': https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
Here you go:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
# sidebar = soup.find_all('div', {'class': 'side_categories'})
sidebar = soup.find_all('a', href=True)
for link in sidebar:
    url = link['href']
    if 'catalogue' in url:
        print(url)
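For the record, the original snippet printed only one link because find_all matched the single side_categories div, so the outer loop ran once, and .find('a') then returned only the first anchor inside that div. A minimal sketch of a fix that stays within the assignment's allowed imports:

import requests
from bs4 import BeautifulSoup

url = "https://books.toscrape.com/"
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')

# There is one sidebar div; find_all('a') yields every anchor inside it
sidebar = soup.find('div', {'class': 'side_categories'})
for a in sidebar.find_all('a'):
    print(a.get('href'))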

Find all a hrefs from a table

I'm trying to scrape Rotten Tomatoes with bs4.
My aim is to find all the a hrefs from the table, but I cannot manage it. Can you help me?
https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/
My code is:
from urllib import request
from bs4 import BeautifulSoup as BS
import re
import pandas as pd
url = 'https://www.rottentomatoes.com/top/bestofrt'
html = request.urlopen(url)
bs = BS(html.read(), 'html.parser')
tags = bs.find_all('a', {'class':'articleLink unstyled'})[7:]
links = ['https://www.rottentomatoes.com' + tag['href'] for tag in tags]
########################################### links ############################################################################
webpages = []
for link in reversed(links):
    print(link)
    html = request.urlopen(link)
    bs = BS(html.read(), 'html.parser')
    tags = bs.find_all('a', {'class': 'unstyled articleLink'})[43:]
    links = ['https://www.rottentomatoes.com' + tag['href'] for tag in tags]
    webpages.extend(links)
print(webpages)
I put the limit of 43 in to skip the irrelevant links that come before the movie links, but it is a short-term solution and does not help.
I need an exact way to scrape the links from the table without picking up irrelevant ones.
Thanks
Just grab the main table and then extract all the <a> tags.
For example:
import requests
from bs4 import BeautifulSoup
rotten_tomatoes_url = 'https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/'
action_and_adventure = [
    f"https://www.rottentomatoes.com{link.get('href')}"
    for link in BeautifulSoup(requests.get(rotten_tomatoes_url).text, "lxml")
    .find("table", class_="table")
    .find_all("a")
]
print(len(action_and_adventure))
print("\n".join(action_and_adventure[:10]))
Output (all 100 links to movies):
100
https://www.rottentomatoes.com/m/black_panther_2018
https://www.rottentomatoes.com/m/avengers_endgame
https://www.rottentomatoes.com/m/mission_impossible_fallout
https://www.rottentomatoes.com/m/mad_max_fury_road
https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse
https://www.rottentomatoes.com/m/wonder_woman_2017
https://www.rottentomatoes.com/m/logan_2017
https://www.rottentomatoes.com/m/coco_2017
https://www.rottentomatoes.com/m/dunkirk_2017
https://www.rottentomatoes.com/m/star_wars_the_last_jedi
Try this (note that find_all takes the attributes dict via the attrs keyword; a positional dict after name='a' is a syntax error):
tags = bs.find_all('a', attrs={'class': 'unstyled articleLink'})[43:]
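If the goal is to drop the hard-coded [43:] slice entirely, another option is to filter by the href pattern: every movie link in the output above starts with /m/. A minimal sketch along those lines (the /m/ prefix is inferred from the URLs listed above):

import requests
from bs4 import BeautifulSoup

url = 'https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Keep only anchors in the main table whose href points at a movie page
table = soup.find('table', class_='table')
movie_links = [
    'https://www.rottentomatoes.com' + a['href']
    for a in table.find_all('a', href=True)
    if a['href'].startswith('/m/')
]
print(len(movie_links))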

How to scrape comments with soup

I would like to scrape the comments from this site: https://www.ceneo.pl/sklepy/morele.net-s379
But after scraping I got an empty file. What did I do wrong?
This is my code:
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.ceneo.pl/sklepy/morele.net-s379")
soup = BeautifulSoup(page.content, "html.parser").find_all("div", class_="js_shop-reviews js_shop reviews-offer")
morele = [[i.getText(strip=True) for i in div.find("div") if i.getText()] for div in soup]
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)
#print(csv_table)
#Export to Csv file
csv_table.to_csv(r'C:/Users/admin/Desktop/morele.csv', sep=";", encoding='utf-8-sig', index=False, header=True)
Try this.
I found the comments are under the class user-post__text, so I changed that.
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.ceneo.pl/sklepy/morele.net-s379")
soup = BeautifulSoup(page.content, "html.parser").find_all("div", {"class": "user-post__text"})  # changes made here
# print(soup)
morele = [div.getText(strip=True) for div in soup] #and here as well
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)
#print(csv_table)
#Export to Csv file
csv_table.to_csv(r'morele.csv', sep=";", encoding='utf-8-sig', index=False, header=True)
Does this solve your problem?
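As for why the original code came back empty: when class_ (or a {"class": ...} dict) is given a string containing spaces, BeautifulSoup compares it against the tag's class attribute as one exact string, so extra classes or a different class order on the live page make it miss. A CSS selector matches each class independently. A minimal sketch, reusing the class names from the question:

import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.ceneo.pl/sklepy/morele.net-s379")
soup = BeautifulSoup(page.content, "html.parser")

# select() matches tags that carry all three classes, in any order
containers = soup.select("div.js_shop-reviews.js_shop.reviews-offer")
print(len(containers))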

Python html parsing using beautiful soup issues

I am trying to get the names of all organizations from https://www.devex.com/organizations/search using BeautifulSoup. However, I am getting an error. Can someone please help?
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
headers = {"Accept-Language": "en-US,en;q=0.5"}
titles = []
pages = np.arange(1, 2, 1)
for page in pages:
    page = requests.get("https://www.devex.com/organizations/search?page%5Bnumber%5D=" + str(page), headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    movie_div = soup.find_all('div', class_='info-container')
    sleep(randint(2, 10))
    for container in movie_div:
        name = container.a.find('h3', class_='ng-binding').text
        titles.append(name)

movies = pd.DataFrame({
    'movie': titles,
})

# to see your dataframe
print(movies)

# to see the datatypes of your columns
print(movies.dtypes)

# to see where you're missing data and how much data is missing
print(movies.isnull().sum())

# to move all your scraped data to a CSV file
movies.to_csv('movies.csv')
You may try something like:
name = soup.find("h3", {"class": "ng-binding"})
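One caveat: the ng-binding class is an Angular artifact, which suggests the organization names are filled in by JavaScript after the page loads, so the HTML that requests receives may not contain them at all (container.a would then be None, hence the error). A common workaround is to render the page in a real browser first; a rough sketch with Selenium (assuming Selenium and a Chrome driver are installed):

import time

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.devex.com/organizations/search")
time.sleep(5)  # crude wait for the Angular app to render; an explicit wait is better

# Parse the browser-rendered HTML instead of the raw server response
soup = BeautifulSoup(driver.page_source, 'html.parser')
names = [h3.get_text(strip=True) for h3 in soup.find_all('h3', class_='ng-binding')]
print(names)
driver.quit()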

How to specify table for BeautifulSoup to find?

I'm trying to grab the table on this page https://nces.ed.gov/collegenavigator/?id=139755 under the Net Price expandable object. I've gone through tutorials for BS4, but I get so confused by the complexity of the HTML in this case that I can't figure out which syntax and which tags to use.
Here's a screenshot of the table and HTML I'm trying to get.
This is what I have so far. How do I add other tags to narrow down the results to just that one table?
import requests
from bs4 import BeautifulSoup
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
soup = soup.find(id="divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02")
print(soup.prettify())
Once I can parse that data, I will format into a dataframe with pandas.
In this case I'd probably just use pandas to retrieve all the tables, then index in for the appropriate one:
import pandas as pd
table = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')[10]
print(table)
If you are worried about the ordering changing in the future, you could loop over the tables returned by read_html and test each one for the presence of a unique string to identify the right table, or use bs4's :has and :contains selector functionality (bs4 4.7.1+) to identify the right table and then pass it to read_html, or continue handling it with bs4.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
r = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = bs(r.content, 'lxml')
table = pd.read_html(str(soup.select_one('table:has(td:contains("Average net price"))')))
print(table)
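A minimal sketch of the loop-and-test alternative mentioned above, assuming the string "Average net price" appears only in the target table:

import pandas as pd

tables = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')

# Keep the first table whose contents mention the identifying string
target = None
for t in tables:
    if 'Average net price' in t.to_string():
        target = t
        break
print(target)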
OK, maybe this can help you. I added pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
div = soup.find("div", {"id": "divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02"})
table = div.findAll("table", {"class": "tabular"})[1]
l = []
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    if td:
        row = [i.text for i in td]
        l.append(row)
df = pd.DataFrame(l, columns=["AVERAGE NET PRICE BY INCOME", "2015-2016", "2016-2017", "2017-2018"])
print(df)
Here is a basic script to scrape that first table in that accordion:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
print(desired_table.prettify())
I assume you only want the values within the table, so I did an overkill version as well that combines the column names and values together:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
header_row = desired_table.find_all('th')
headers = []
for header in header_row:
    header_text = header.get_text()
    headers.append(header_text)

money_values = []
data_row = desired_table.find_all('td')
for rows in data_row:
    row_text = rows.get_text()
    money_values.append(row_text)

for yrs, money in zip(headers, money_values):
    print(yrs, money)
This will print out the following:
Average net price
2015-2016 $13,340
2016-2017 $15,873
2017-2018 $16,950
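Since the stated end goal was a pandas dataframe, the two lists zipped above drop straight into one. A minimal sketch, using the printed year/price pairs as sample data:

import pandas as pd

# Sample values copied from the output above
headers = ["2015-2016", "2016-2017", "2017-2018"]
money_values = ["$13,340", "$15,873", "$16,950"]

df = pd.DataFrame({"Year": headers, "Average net price": money_values})
print(df)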
