Scraping reviews from Tripadvisor - Python

Suppose I am scraping reviews from the URL
https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html
The reviews are spread over a number of pages. How can I scrape the reviews from all the following pages as well?
I used the code below, but it still only shows the reviews from the first page!
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html"
MAX_PAGES = 30
counter = 0

for i in range(1, MAX_PAGES):
    if i > 1:
        url = "%spage/%d/" % (URL_BASE, i)
    else:
        url = URL_BASE

    req = requests.get(url)
    statusCode = req.status_code

    if statusCode == 200:
        html = BeautifulSoup(req.text, "html.parser")
        resultsoup = html.find_all('P', {'class': 'partial_entry'})
    else:
        break

    for review in resultsoup:
        review_list = review.get_text()
        print(review_list)

Based on an example for Scrapy.
The server adds an offset to the URL (anywhere before .html):
-or5 to get the second page,
-or10 to get the third page,
etc.
You could even skip the words (which are only there for SEO) and use just
https://www.tripadvisor.com/g562819-d289642-or5.html
https://www.tripadvisor.com/g562819-d289642-or10.html
to get the next pages with reviews.
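For illustration, a minimal sketch of that offset pattern (the step of 5 reviews per page and the shortened URL are taken from above; the number of pages printed is just an example):
URL_TEMPLATE = "https://www.tripadvisor.com/g562819-d289642-or{}.html"

# offset 0 is the first page, 5 the second, 10 the third, and so on
for offset in range(0, 15, 5):
    print(URL_TEMPLATE.format(offset))
The full code below builds the same template from the original URL instead: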
from bs4 import BeautifulSoup
import requests
import re
#import webbrowser

def get_soup(url):

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

    r = s.get(url, headers=headers)

    #with open('temp.html', 'wb') as f:
    #    f.write(r.content)
    #    webbrowser.open('temp.html')

    if r.status_code != 200:
        print('status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def parse(url, response):

    if not response:
        print('no response:', url)
        return

    # get number of reviews
    num_reviews = response.find('span', class_='reviews_header_count').text
    num_reviews = num_reviews[1:-1] # remove `( )`
    num_reviews = num_reviews.replace(',', '') # remove `,`
    num_reviews = int(num_reviews)
    print('num_reviews:', num_reviews, type(num_reviews))

    # create template for urls to pages with reviews
    url = url.replace('.html', '-or{}.html')
    print('template:', url)

    # load pages with reviews
    for offset in range(0, num_reviews, 5):
        print('url:', url.format(offset))
        url_ = url.format(offset)
        parse_reviews(url_, get_soup(url_))
        return # for test only - to stop after first page

def parse_reviews(url, response):
    print('review:', url)

    if not response:
        print('no response:', url)
        return

    # get every review
    for idx, review in enumerate(response.find_all('div', class_='review-container')):
        item = {
            'hotel_name': response.find('h1', class_='heading_title').text,
            'review_title': review.find('span', class_='noQuotes').text,
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='relativeDate')['title'], #.text,#[idx],
            'num_reviews_reviewer': review.find('span', class_='badgetext').text,
            'reviewer_name': review.find('span', class_='scrname').text,
            'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
        }
        results.append(item) # <--- add to global list
        #~ yield item

        for key,val in item.items():
            print(key, ':', val)
        print('----')
        #return # for test only - to stop after first review

# --- main ---

s = requests.Session()

start_urls = [
    'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

results = [] # <--- global list for items

for url in start_urls:
    parse(url, get_soup(url))

import pandas as pd

df = pd.DataFrame(results) # <--- convert list to DataFrame
df.to_csv('output.csv')    # <--- save in file

Related

Trying to scrape other categories with BeautifulSoup

This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/
This is my code. As you can see, I don't know how to fill both of the {} placeholders in the url variable for the four categories that I want to scrape; the URL for the Services category in particular is very different. The comments above the url variable show the links for the four categories when clicked. I appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
    #cate=Food+%26+Beverage
    #cate=Electronics+%26+Technology
    #cate=Fashion+%26+Accessories
    #cate=Services
    url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1,14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text ,"html.parser")
            for link in soup.find_all('div',class_='entry-content'):
                try:
                    shops=soup.find_all('div',class_="col-9")
                    names=soup.find_all('tr',class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ','')
                        desc = k.text.replace(' ','')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request and avoid having to manage escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'

Loop when scraping Tripadvisor with BeautifulSoup

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
The problem is that it keeps looping. Once it has parsed all the reviews, it goes back to the most recent ones and starts over, so it never produces the .csv with all the reviews.
The funniest part is that I am only encountering issues with some venues (for example) and not with others (such as this one).
I'm attaching the code here in case someone can help me.
--
Here is the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='reviews_header_count').text # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews) # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while(True):
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', class_='heading').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name' , 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
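One possible way to stop the looping described above, sketched here as an untested guess: since parse() already computes num_reviews, its while loop could be bounded by that count, so offsets past the last page (which the site may redirect back to the first page) are never requested.
# hedged sketch of a bounded loop for parse(), replacing the while(True) version above
items = []
offset = 0
while offset < num_reviews:
    subpage_url = url_template.format(offset)
    subpage_items = parse_reviews(session, subpage_url)
    if not subpage_items:
        break
    items += subpage_items
    offset += 5
# items now holds each page of reviews at most once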

ValueError when scraping Tripadvisor for reviews with BeautifulSoup

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked for one link (though only after removing the "language" attribute), but it doesn't work for any other link (for example).
I'm receiving the error:
Traceback (most recent call last):
  File "<pyshell#37>", line 4, in <module>
    items = scrape(url)
  File "<pyshell#13>", line 11, in scrape
    items = parse(session, url + '?filterLang=' + lang)
  File "<pyshell#18>", line 15, in parse
    num_reviews = int(num_reviews) # convert text into integer
ValueError: invalid literal for int() with base 10: '5.695'
(where 5,695 is the number of reviews on the page)
I'm attaching the code here in case someone can help me.
Thank you so much!
Silvia
--
Here is the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='reviews_header_count').text # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews) # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while(True):
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name' , 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

lang = 'it'

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
Thanks to all the commenters. I realized the issue lay in the different Italian and US conventions for writing thousands separators (we use ".", whereas Americans use ",").
You seem to have the string 5.695 for the number of reviews before trying to cast it to int with num_reviews = int(num_reviews).
The . in 5.695 is probably a thousands separator.
So remove the . like this before calling int():
num_reviews = num_reviews.replace('.', '')
num_reviews = int(num_reviews)
The error is due to the full stop in the number you are trying to convert. To make sure it works with either formatting convention, strip both separators before converting to int:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '').replace('.','')
num_reviews = int(num_reviews)
Or, more generically, keep only the numerical characters in the num_reviews string.
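A minimal sketch of that more generic approach (the regular expression is my own illustration, not part of the original answer):
import re

num_reviews = soup.find('span', class_='reviews_header_count').text
num_reviews = int(re.sub(r'\D', '', num_reviews))  # drop every non-digit character, then convert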
The string cannot be parsed directly to an integer value. In this case, first convert it into a float and then, if you want, convert it to an int:
num_reviews = int(float(num_reviews))

How to grab the next page URL from pagination

I am unable to fetch the URL for the next page; it throws a traceback error. Basically, I want to grab "/browse-movies?page=2".
from bs4 import BeautifulSoup
import requests
import re

url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('ul', 'tsc_pagination')[0]
for item in items:
    print item
You could use range(1, 300) to iterate all pages:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'browse-movie-wrap')
    for item in items:
        for val in item.find_all('div','browse-movie-bottom'):
            title = item.find_all('a','browse-movie-title')[0].text
            year = item.find_all('div','browse-movie-year')[0].text
        for val in item.find_all('a','browse-movie-link'):
            try:
                rating = val.find_all('h4')[0].text
                genre = val.find_all('h4')[1].text
            except:
                pass
        print year, rating, genre, title
P.S. You might want to add time.sleep(1) to slow down a little, in case they block your IP for scraping their pages too aggressively.
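For example (a minimal illustration of where the pause could go, at the end of the page loop):
import time

for i in range(1, 300):
    # ... request and parse the page as above ...
    time.sleep(1)  # wait one second between requests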
Edit:
Now, to look for the next page URL, you could use a regular expression:
import re
next_page = soup.find('a', text=re.compile(r'.*Next.*'))
print next_page['href']
What it does is look for an a tag whose content matches the regular expression '.*Next.*'.
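Putting the two together, here is a hedged sketch (Python 3 syntax) of following the Next link until it disappears; note that the href is relative, so it is joined back onto the current page URL:
import re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = "https://yts.ag/browse-movies?page=1"

while url:
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    # ... parse the movies on this page as above ...
    next_page = soup.find('a', text=re.compile(r'.*Next.*'))
    url = urljoin(url, next_page['href']) if next_page else None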
urls = ["https://yts.ag/browse-movies?page={}".format(i) for i in range(1, 10)] # make a url list and iterate over it

for url in urls:
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    # your code here
    print year, rating, genre, title
Make a URL list and iterate over it. You can change the range.

Getting a list of URLs and then finding specific text from all of them in Python 3.5.1

So I have this code that will give me the URLs I need in a list format:
import requests
from bs4 import BeautifulSoup

offset = 0
links = []

with requests.Session() as session:
    while True:
        r = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        soup = BeautifulSoup(r.content, "html.parser")
        new_links = soup.find_all("a", {'class' : "thumb"})

        # no more links - break the loop
        if not new_links:
            break

        # denotes the number of gallery pages gone through at one time (# of pages times 24 equals the number below)
        links.extend(new_links)
        print(len(links))
        offset += 24

        # denotes the number of gallery pages (# of pages times 24 equals the number below)
        if offset == 48:
            break

for link in links:
    print(link.get("href"))
After that, I try to get some text from each of the URLs; that text sits in roughly the same place on each page. But whenever I run the second half, below, I keep getting a chunk of HTML text and some errors, and I'm not sure how to fix it, or whether there is another, preferably simpler, way to get the text from each URL.
import urllib.request
import re

for link in links:
    url = print("%s" % link)
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    req = urllib.request.Request(url, headers = headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()

    paragraphs = re.findall(r'</a><br /><br />(.*?)</div>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span></div>', str(respData))

    for eachP in paragraphs:
        print(eachP)

    title = re.findall(r'<title>(.*?)</title>', str(respData))
    for eachT in title:
        print(eachT)
Your code:
for link in links:
    url = print("%s" % link)
assigns None to url. Perhaps you mean:
for link in links:
    url = "%s" % link.get("href")
There's also no reason to use urllib to get the site's content; you can use requests, as you did before, by changing:
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
to
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
Now you can get the title and paragraph with just:
title = soup.find('div', {'class': 'dev-title-container'}).h1.text
paragraph = soup.find('div', {'class': 'text block'}).text
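A hedged sketch tying this together with the links collected earlier (the class names come from the answer above and may have changed on the site since):
for link in links:
    url = "%s" % link.get("href")
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "html.parser")
    title = soup.find('div', {'class': 'dev-title-container'}).h1.text
    paragraph = soup.find('div', {'class': 'text block'}).text
    print(title)
    print(paragraph)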

Categories

Resources