I'm trying to get the name, address and key contacts from a webpage using a Python script. I can get each of them individually the right way. However, what I wish to do is get the name and address as strings and the key contacts in a list, so that I can write them to a CSV file in 6 columns. I can't find any way to include the value of data-cfemail within the list of contacts.
Website address
I've tried with:
import requests
from bs4 import BeautifulSoup
link = "https://www.fis.com/fis/companies/details.asp?l=e&filterby=species&specie_id=615&page=1&company_id=160574&country_id="
res = requests.get(link,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(res.text,'lxml')
name = soup.select_one("#name").text.strip()
address = soup.select("#description_details tr:contains('Address:') td")[1].text
contacts = [' '.join(item.get_text(strip=True).split()) for item in soup.select("#contacts table tr td")]
print(name,address,contacts)
Current output:
Bahia Grande S.A. - BG Group
Maipú 1252 Piso 8°
['Founder & PresidentMr Guillermo Jacob', 'VP FinanceMr Andres Jacob[email protected]', 'ControllerMr Juan Carlos Peralta[email protected]', 'VP AdmnistrationMs Veronica Vinuela[email protected]', '']
Expected output (as the emails are protected the value of data-cfemail will do):
Bahia Grande S.A. - BG Group
Maipú 1252 Piso 8°
[Founder & President, Mr Guillermo Jacob]
[VP Finance, Mr Andres Jacob,bbdad1dad8d4d9fbd9dad3d2dadcc9dad5dfde95d8d4d695dac9]
[Controller,Mr Juan Carlos Peralta,0b61687b6e796a677f6a4b696a63626a6c796a656f6e25686466256a79]
[VP Admnistration,Ms Veronica Vinuela,87f1f1eee9f2e2ebe6c7e5e6efeee6e0f5e6e9e3e2a9e4e8eaa9e6f5]
You could do it the following way: restrict to the appropriate tds with #contacts td[height], then to the appropriate ids with
td.select('#contacts_title, #contacts_name, #contacts_email'), and then test in a list comprehension whether the current element contains the cfemail and act accordingly.
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.fis.com/fis/companies/details.asp?l=e&filterby=species&specie_id=615&page=1&company_id=160574&country_id=')
soup = bs(r.content, 'lxml')
name = soup.select_one('#name').text.strip()
address = soup.select_one('#description_details td:contains("Address:") + td div').text
print(name)
print(address)
for td in soup.select('#contacts td[height]'):
    print([i.text.strip().replace('\xa0',' ') if i.select_one('.__cf_email__') is None else i.select_one('.__cf_email__')['data-cfemail']
           for i in td.select('#contacts_title, #contacts_name, #contacts_email')])
OP's implementation:
contacts = [', '.join([i.text.strip().replace('\xa0',' ') if i.select_one('.__cf_email__') is None else i.select_one('.__cf_email__')['data-cfemail'] for i in td.select('#contacts_title, #contacts_name, #contacts_email')]) for td in soup.select('#contacts td[height]')]
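To finish the job described in the question (writing everything to a CSV with 6 columns), here is a minimal sketch using the standard csv module; it assumes one row per company made of the name, the address and up to four contact strings, so adjust the padding if you prefer one row per contact instead:
import csv

# name, address and contacts come from the code above; pad/trim the
# contacts so every row has exactly six columns
row = [name, address] + (contacts + [''] * 4)[:4]

with open('companies.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(row)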
You can iterate over the table storing the contact information:
import requests
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://www.fis.com/fis/companies/details.asp?l=e&filterby=species&specie_id=615&page=1&company_id=160574&country_id=').text, 'html.parser')
title, address = d.find('div', {'id':'name'}).text, d.find('div', {'id':'description_details'}).tr.div.text
contacts = [[i.find_all('div') for i in b.find_all('td')] for b in d.find('div', {'id':'contacts'}).table.find_all('tr')]
result = [[j.get_text(strip=True) if j.a is None else j.a.span['data-cfemail'] for j in i] for b in contacts for i in b if i]
Output:
'\xa0Bahia Grande S.A. - BG Group' #title
'Maipú 1252 Piso 8°' #address
[['Founder & President', 'Mr\xa0Guillermo\xa0Jacob'], ['VP Finance', 'Mr\xa0Andres\xa0Jacob', 'e6878c87858984a684878e8f87819487888283c885898bc88794'], ['Controller', 'Mr\xa0Juan Carlos\xa0Peralta', '264c45564354474a52476644474e4f474154474842430845494b084754'], ['VP Admnistration', 'Ms\xa0Veronica\xa0Vinuela', 'baccccd3d4cfdfd6dbfad8dbd2d3dbddc8dbd4dedf94d9d5d794dbc8']] #contact info
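As an aside, if the plain email addresses are ever needed rather than the raw data-cfemail values: the Cloudflare obfuscation is just an XOR against the first byte of the hex string. A small decoder sketch (not part of the answers above):
def decode_cfemail(cfemail):
    # the first hex byte is the XOR key, the remaining bytes are the obfuscated characters
    key = int(cfemail[:2], 16)
    return ''.join(chr(int(cfemail[i:i + 2], 16) ^ key) for i in range(2, len(cfemail), 2))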
I want to prepare a dataframe of universities, its abbrevations and website link.
My code:
import requests
import pandas as pd

abb_url = 'https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States'
abb_html = requests.get(abb_url).content
abb_df_list = pd.read_html(abb_html)
Current result:
ValueError: No tables found
Expected answer:
df =
| | university_full_name | uni_abb | uni_url|
---------------------------------------------------------------------
| 0 | Albert Einstein College of Medicine | AECOM | https://en.wikipedia.org/wiki/Albert_Einstein_College_of_Medicine|
That's one funky page you have there...
First, there are indeed no tables in there. Second, some organizations don't have links, others have redirect links and still others use the same abbreviation for more than one organization.
So you need to bring in the heavy artillery: xpath...
import pandas as pd
import requests
from lxml import html as lh
url = "https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States"
response = requests.get(url)
doc = lh.fromstring(response.text)
rows = []
for uni in doc.xpath('//h2[./span[@class="mw-headline"]]//following-sibling::ul//li'):
    info = uni.text.split(' – ')
    abb = info[0]
    # for those w/ no links
    if not uni.xpath('.//a'):
        rows.append((abb, " ", info[1]))
    # now to account for those using the same abbreviation for multiple organizations
    for a in uni.xpath('.//a'):
        dat = a.xpath('./@*')
        # for those with redirects
        if len(dat) == 3:
            del dat[1]
        link = f"https://en.wikipedia.org{dat[0]}"
        rows.append((abb, link, dat[1]))
#and now, at last, to the dataframe
cols = ['abb','url','full name']
df = pd.DataFrame(rows,columns=cols)
df
Output:
abb url full name
0 AECOM https://en.wikipedia.org/wiki/Albert_Einstein_... Albert Einstein College of Medicine
1 AFA https://en.wikipedia.org/wiki/United_States_Ai... United States Air Force Academy
etc.
Note: you can rearrange the order of columns in the dataframe, if you are so inclined.
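For example, to put the full name first, matching the layout in the expected answer (plain pandas column re-selection, nothing page-specific):
df = df[['full name', 'abb', 'url']]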
Select and iterate over only the expected <li> elements and extract their information, but be aware there is a university without an <a> (SUI – State University of Iowa), so this has to be handled with an if-statement, as in the example:
for e in soup.select('h2 + ul li'):
    data.append({
        'abb': e.text.split('-')[0],
        'full_name': e.text.split('-')[-1],
        'url': 'https://en.wikipedia.org' + e.a.get('href') if e.a else None
    })
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = "https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
data = []
for e in soup.select('h2 + ul li'):
    data.append({
        'abb': e.text.split('-')[0],
        'full_name': e.text.split('-')[-1],
        'url': 'https://en.wikipedia.org' + e.a.get('href') if e.a else None
    })
pd.DataFrame(data)
Output:
   abb                           full_name                                        url
0  AECOM                         Albert Einstein College of Medicine              https://en.wikipedia.org/wiki/Albert_Einstein_College_of_Medicine
1  AFA                           United States Air Force Academy                  https://en.wikipedia.org/wiki/United_States_Air_Force_Academy
2  Annapolis                     U.S. Naval Academy                               https://en.wikipedia.org/wiki/United_States_Naval_Academy
3  A&M                           Texas A&M University, but also others; see A&M   https://en.wikipedia.org/wiki/Texas_A%26M_University
4  A&M-CC or A&M-Corpus Christi  Corpus Christi                                   https://en.wikipedia.org/wiki/Texas_A%26M_University%E2%80%93Corpus_Christi
...
There are no tables on this page, only lists. So the goal will be to go through the <ul> and then the <li> tags, skipping the lists you are not interested in (the first and those after the 26th).
You can extract the abbreviation of the university this way:
uni_abb = li.text.strip().replace(' - ', ' – ').replace(' — ', ' – ').split(' – ')[0]
while to get the url you have to access the 'href' and 'title' attributes inside the <a> tag:
for a in li.find_all('a', href=True):
    title = a['title']
    url = f"https://en.wikipedia.org{a['href']}"
Accumulate the extracted information into a list, and finally create the dataframe by assigning appropriate column names.
Here is the complete code, in which I use BeautifulSoup:
import requests
import pandas as pd
from bs4 import BeautifulSoup
abb_url = 'https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States'
abb_html = requests.get(abb_url).content
soup = BeautifulSoup(abb_html, 'html.parser')
l = []
for ul in soup.find_all("ul")[1:26]:
    for li in ul.find_all("li"):
        uni_abb = li.text.strip().replace(' - ', ' – ').replace(' — ', ' – ').split(' – ')[0]
        for a in li.find_all('a', href=True):
            l.append((a['title'], uni_abb, f"https://en.wikipedia.org{a['href']}"))
df = pd.DataFrame(l, columns=['university_full_name', 'uni_abb', 'uni_url'])
Result:
university_full_name uni_abb uni_url
0  Albert Einstein College of Medicine   AECOM   https://en.wikipedia.org/wiki/Albert_Einstein...
1  United States Air Force Academy       AFA     https://en.wikipedia.org/wiki/United_States_A...
I am trying to scrape the address from the link below:
https://www.yelp.com/biz/rollin-phatties-houston
But I am getting only the first part of the address (i.e. 1731 Westheimer Rd) out of the complete comma-separated address:
1731 Westheimer Rd, Houston, TX 77098
Can anyone help me out with this? Please find my code below:
import bs4 as bs
import urllib.request as url
source = url.urlopen('https://www.yelp.com/biz/rollin-phatties-houston')
soup = bs.BeautifulSoup(source, 'html.parser')
mains = soup.find_all("div", {"class": "secondaryAttributes__09f24__3db5x arrange-unit__09f24__1gZC1 border-color--default__09f24__R1nRO"})
main = mains[0] #First item of mains
address = []
for main in mains:
    try:
        address.append(main.address.find("p").text)
    except:
        address.append("")
print(address)
# 1731 Westheimer Rd
I want the result from the p tag; below are the details of the two p tags:
Note: I want to append the address to a list.
<p class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--subtle__373c0__3DZpi text-align--left__373c0__2XGa-">1731 Westheimer Rd, Houston, TX 77098</p>
<p class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-weight--semibold__373c0__2l0fe text-size--large__373c0__3t60B"><a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--blue-dark__373c0__85-Nu link-size--inherit__373c0__1VFlE" href="/map/rollin-phatties-houston" target="" name="" rel="" role="link">Get Directions</a><p class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--subtle__373c0__3DZpi text-align--left__373c0__2XGa-">1731 Westheimer Rd, Houston, TX 77098</p></p>
For more details, you can refer to the link: https://www.yelp.com/biz/rollin-phatties-houston
The page is loaded dynamically, so urllib.request won't see the rendered content. However, the data is available in JSON format on the page, which you can find using the built-in re module and extract using the built-in json module.
import re
import json
import bs4 as bs
import urllib.request as url
source = url.urlopen("https://www.yelp.com/biz/rollin-phatties-houston")
soup = bs.BeautifulSoup(source, "html.parser")
data = soup.select_one(
    "#wrap > div.main-content-wrap.main-content-wrap--full > yelp-react-root > script"
).string
json_data = json.loads(re.search(r"({.*})", data).group(1))
print(json_data["bizDetailsPageProps"]["bizContactInfoProps"]["businessAddress"])
Output:
1731 Westheimer Rd, Houston, TX 77098
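Since the note says the address should end up in a list, it can simply be appended to one (same key path as above):
address = []
address.append(json_data["bizDetailsPageProps"]["bizContactInfoProps"]["businessAddress"])
print(address)  # ['1731 Westheimer Rd, Houston, TX 77098']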
I'm trying to create a script in Python to fetch all the links connected to the names of different actors from imdb.com, then parse the first three of their movie links, and finally scrape the name of the director and writer of those movies. There are around 1000 names in there. I'm okay with the first three names for this example.
Website link
I can scrape the links of different actors and their first three movie links in one go.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://www.imdb.com/list/ls058011111/'
base = 'https://www.imdb.com/'
def get_actor_list(s):
    res = s.get(url,headers={"User-Agent":"Mozilla/5.0"})
    soup = BeautifulSoup(res.text,"lxml")
    for name_links in soup.select(".mode-detail")[:3]:
        name = name_links.select_one("h3 > a").get_text(strip=True)
        item_link = urljoin(base,name_links.select_one("h3 > a").get("href"))
        yield from get_movie_links(s,name,item_link)

def get_movie_links(s,name,link):
    r = s.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    item_links = [urljoin(base,item.get("href")) for item in soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]
    yield name,item_links

if __name__ == '__main__':
    with requests.Session() as s:
        for elem in get_actor_list(s):
            print(elem)
The result I get:
('Robert De Niro', ['https://www.imdb.com/title/tt4075436/', 'https://www.imdb.com/title/tt3143812/', 'https://www.imdb.com/title/tt5537002/'])
('Jack Nicholson', ['https://www.imdb.com/title/tt1341188/', 'https://www.imdb.com/title/tt1356864/', 'https://www.imdb.com/title/tt0825232/'])
('Marlon Brando', ['https://www.imdb.com/title/tt10905860/', 'https://www.imdb.com/title/tt0442674/', 'https://www.imdb.com/title/tt1667880/'])
I can even parse the name of directors and the writers of those linked movies if I individually use those links within the following function:
def get_content(s,url):
    res = s.get(url,headers={"User-Agent":"Mozilla/5.0"})
    soup = BeautifulSoup(res.text,"lxml")
    director = soup.select_one("h4:contains('Director') ~ a")
    director = director.get_text(strip=True) if director else None
    writer = soup.select_one("h4:contains('Writer') ~ a").get_text(strip=True)
    print(director,writer)
However, I would like to rework the script, merging those functions in such a way that it produces the following (final) output:
('Robert De Niro', [Jonathan Jakubowicz, Jonathan Jakubowicz, None, Anthony Thorne, Martin Scorsese, David Grann])
('Jack Nicholson', [James L. Brooks, James L. Brooks, Casey Affleck, Casey Affleck, Rob Reiner, Justin Zackham])
('Marlon Brando', [Bob Bendetson, Bob Bendetson, Peter Mitchell, Rubin Mario Puzo, Paul Hunter, Paul Hunter])
How can I get the final output merging the above functions in the right way?
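One way to merge them (a sketch, not a tested script: it reuses the selectors already shown above, turns get_content into a function that returns the two names instead of printing them, and has get_movie_links collect them per movie; get_actor_list stays unchanged):
def get_content(s, url):
    res = s.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")
    director = soup.select_one("h4:contains('Director') ~ a")
    writer = soup.select_one("h4:contains('Writer') ~ a")
    # return instead of print so the caller can collect the values
    return [director.get_text(strip=True) if director else None,
            writer.get_text(strip=True) if writer else None]

def get_movie_links(s, name, link):
    r = s.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "lxml")
    item_links = [urljoin(base, item.get("href")) for item in
                  soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]
    people = []
    for movie_link in item_links:
        people.extend(get_content(s, movie_link))  # director, writer for each movie
    yield name, people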
I am trying to get (some) contents of a table from a URL.
So far I have managed to get two of the desired contents of the page, but there is a third one (the third column) from which I would like to get only the text. The problem is, the underlying link exists elsewhere on the page (with different text), and if I want to load the table into an SQL database, the contents of the third column won't match the first two columns.
import urllib2
from bs4 import BeautifulSoup
startURL = "http://some.url/website.html"
page = urllib2.urlopen(startURL).read()
soup = BeautifulSoup(page, "html.parser")
for links in soup.findAll("a"):
    if "href" in links.attrs:
        www = links.attrs.values()
        if not "https://" in www[0]:  # to exclude all non-relative links, e.g. external links
            if "view/" in www[0]:  # to get only my desired links of column 1
                link_of_column1 = www[0]  # this is now my wanted link
Okay, so with this code I can get the second column. Where and how would I have to apply .next_sibling to get the link in the next (3rd) column?
Edit:
As I have been asked: the URL is https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html and I want to get the contents of columns 2 and 3, which are "Hosting Company" (link text and link) and "Country" (only the text).
Edit2:
Another thing I forgot: how can I extract the information that there are 137,157 records?
First, find the table which has all the info using its id="web_hosting_tbl" attribute. Then iterate over the rows of the table. But if you look at the page source, the rows you need are not consecutive but alternate, and they don't have any class names. Also, the first row of the table is the header row, so we have to skip that.
After getting the required rows (using table.find_all('tr')[1::2]), find all the columns and then get the required information from the corresponding columns.
Code:
import requests
from bs4 import BeautifulSoup
r = requests.get('https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html')
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find('table', id='web_hosting_tbl')
for row in table.find_all('tr')[1::2]:
    all_columns = row.find_all('td')
    name = all_columns[1].a.text
    link = all_columns[1].a['href']
    country = all_columns[2].a.text
    print(name, link, country, sep=' | ')
Partial output:
Godaddy.com, LLC | /view/web_hosting/2433/Godaddy_com_LLC.html | USA
Cloudflare, Inc | /view/web_hosting/4638/Cloudflare_Inc.html | USA
Amazon.com, Inc | /view/web_hosting/615/Amazon_com_Inc.html | USA
Ovh Sas | /view/web_hosting/7593/Ovh_Sas.html | France
Hetzner Online Ag | /view/web_hosting/45081/Hetzner_Online_Ag.html | Germany
Hostgator.com Llc | /view/web_hosting/26757/Hostgator_com_Llc.html | USA
Google Inc | /view/web_hosting/617/Google_Inc.html | USA
Bluehost Inc | /view/web_hosting/3886/Bluehost_Inc.html | USA
...
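Regarding Edit2: a hedged option for the total record count, assuming the "137,157 records" figure appears as visible text somewhere in the fetched HTML (the exact wording on the page may differ), is a simple regex over the parsed text:
import re

# soup is the BeautifulSoup object built above
match = re.search(r'([\d,]+)\s+records', soup.get_text(), re.IGNORECASE)
total_records = int(match.group(1).replace(',', '')) if match else None
print(total_records)  # e.g. 137157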
Code: (Python 3.6+, uses f-strings)
import urllib.parse
from collections import namedtuple
from datetime import datetime
import bs4
import requests
HostingCompany = namedtuple('HostingCompany',
                            ('name', 'country', 'websites', 'usage', 'usage_by_top', 'update_time'))

class MyIpLink:
    url_base = 'https://myip.ms'

    def __init__(self, tag: bs4.element.Tag, *, is_anchor=False):
        a_tag = tag.find('a')
        if is_anchor:  # treat `tag` as an anchor tag
            a_tag = tag
        self.text = tag.text.strip()
        self.url = urllib.parse.urljoin(self.url_base, a_tag['href'])

    def __repr__(self):
        return f'{self.__class__.__name__}(text={repr(self.text)}, url={repr(self.url)})'
url = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
html = requests.get(url).text
soup = bs4.BeautifulSoup(html, 'html.parser')
rows = soup.select('#web_hosting_tbl > tbody > tr')[::2] # skips "more info" rows
companies = []
for row in rows:
    tds = row.find_all('td')
    name = MyIpLink(tds[1])
    country = MyIpLink(tds[2])
    websites = [MyIpLink(a, is_anchor=True) for a in tds[3].find_all('a')]
    usage = MyIpLink(tds[4])
    usage_by_top = MyIpLink(tds[5])
    update_time = datetime.strptime(tds[6].text.strip(), '%d %b %Y, %H:%M')
    company = HostingCompany(name, country, websites, usage, usage_by_top, update_time)
    companies.append(company)
import pprint
pprint.pprint(companies)
print(companies[0].name.text)
print(companies[0].name.url)
print(companies[0].country.text)
Output:
[HostingCompany(name=MyIpLink(text='Godaddy.com, LLC', url='https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.godaddy.com', url='https://myip.ms/go.php?1229687315_ITg7Im93dCkWE0kNAhQSEh0FUeHq5Q==')], usage=MyIpLink(text='512,701 sites', url='https://myip.ms/browse/sites/1/ownerID/2433/ownerIDii/2433'), usage_by_top=MyIpLink(text='951 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/2433/ownerIDii/2433'), update_time=datetime.datetime(2018, 5, 2, 5, 17)),
HostingCompany(name=MyIpLink(text='Cloudflare, Inc', url='https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.cloudflare.com', url='https://myip.ms/go.php?840626136_OiEsK2ROSxAdGl4QGhYJG+Tp6fnrv/f49w==')], usage=MyIpLink(text='488,119 sites', url='https://myip.ms/browse/sites/1/ownerID/4638/ownerIDii/4638'), usage_by_top=MyIpLink(text='16,160 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/4638/ownerIDii/4638'), update_time=datetime.datetime(2018, 5, 2, 5, 10)),
HostingCompany(name=MyIpLink(text='Amazon.com, Inc', url='https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.amazonaws.com', url='https://myip.ms/go.php?990446041_JyYhKGFxThMQHUMRHhcDExHj8vul7f75')], usage=MyIpLink(text='453,230 sites', url='https://myip.ms/browse/sites/1/ownerID/615/ownerIDii/615'), usage_by_top=MyIpLink(text='9,557 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/615/ownerIDii/615'), update_time=datetime.datetime(2018, 5, 2, 5, 4)),
...
]
Godaddy.com, LLC
https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html
USA
Try the approach below. It should give you the texts from column 2, the links from column 2, and the texts from column 3 of that table. I used lxml instead of BeautifulSoup to make it faster.
import requests
from urllib.parse import urljoin
from lxml.html import fromstring
URL = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
res = requests.get(URL)
root = fromstring(res.text)
for items in root.cssselect('#web_hosting_tbl tr:not(.expand-child)')[1:]:
    name = items.cssselect("td.row_name a")[0].text
    link = urljoin(URL, items.cssselect("td.row_name a")[0].attrib['href'])
    country = items.cssselect("td a[href^='/view/best_hosting/']")[0].text
    print(name, link, country)
Results:
Godaddy.com, LLC https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html USA
Cloudflare, Inc https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html USA
Amazon.com, Inc https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html USA
Ovh Sas https://myip.ms/view/web_hosting/7593/Ovh_Sas.html France
Hetzner Online Ag https://myip.ms/view/web_hosting/45081/Hetzner_Online_Ag.html Germany
Hostgator.com Llc https://myip.ms/view/web_hosting/26757/Hostgator_com_Llc.html USA
Google Inc https://myip.ms/view/web_hosting/617/Google_Inc.html USA
I am currently trying to pull data from TripAdvisor restaurant pages in different countries. The fields I am trying to pull are name, address, and cuisine type (Chinese, steakhouse, etc.). I have successfully been able to pull the name and address using my script; however, pulling the cuisine type is proving pretty difficult for me. Below you'll find a screenshot of what I am trying to pull from TripAdvisor, followed by my code.
What I want to pull from TripAdvisor is circled in red.
When I run my code it keeps printing 'Asian', even though the second one should be 'Steakhouse'.
#import libraries
import requests
from bs4 import BeautifulSoup
import csv
#loop to move into the next pages. entries are in increments of 30 per page
for i in range(0, 120, 30):
    #need this here for when you want more than 30 entries pulled
    while i <= range:
        i = str(i)
        #url format offsets the restaurants in increments of 30 after the oa
        url1 = 'https://www.tripadvisor.com/Restaurants-g294217-oa' + i + '-Hong_Kong.html#EATERY_LIST_CONTENTS'
        r1 = requests.get(url1)
        data1 = r1.text
        soup1 = BeautifulSoup(data1, "html.parser")
        for link in soup1.findAll('a', {'property_title'}):
            #print 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
            restaurant_url = 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
            #print link.string
            account_name = link.string.strip()
            #cuisine type pull
            for link in soup1.findAll('a', {'cuisine'}):
                cuisinetype = link.string.strip()
            r_address = requests.get(restaurant_url)
            r_addresstext = r_address.text
            soup2 = BeautifulSoup(r_addresstext, "html.parser")
            for restaurant_url in soup2.findAll('span', {'street-address'})[0]:
                #print(restaurant_url.string)
                rest_address = restaurant_url.string
            rest_array = [account_name, rest_address, cuisinetype]
            print rest_array
            #with open('ListingsPull-HongKong.csv', 'a') as file:
                #writer = csv.writer(file)
                #writer.writerow([account_name, rest_address])
        break
This approach is not especially elegant, but it might be acceptable to you. I notice that the information you want seems to be repeated under the 'Details' tab for 'Cuisine', and I've found it easier to access it there this way.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> restaurant_url='https://www.tripadvisor.ca/Restaurant_Review-g294217-d2399904-Reviews-Tin_Lung_Heen-Hong_Kong.html'
>>> soup2 = BeautifulSoup(requests.get(restaurant_url).text, "html.parser")
>>> street_address=soup2.find('span',{'street-address'})
>>> street_address
<span class="street-address" property="streetAddress">International Commerce Centre, 1 Austin Road West, Kowloon</span>
>>> street_address.contents[0]
'International Commerce Centre, 1 Austin Road West, Kowloon'
>>> for item in soup2.findAll('div', attrs={'class', 'title'}):
...     if 'Cuisine' in item.text:
...         item.text.strip()
...         break
...
'Cuisine'
>>> content=item.findNext('div', attrs={'class', 'content'})
>>> content
<div class="content">
Chinese, Asian
</div>
>>> content.text
'\nChinese,\xa0Asian\n'
>>> content.text.strip().split('\xa0')
['Chinese,', 'Asian']
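To turn that into a clean list of cuisine names, a small continuation of the transcript above (splitting on the comma instead of the non-breaking space):
>>> [c.strip() for c in content.text.replace('\xa0', ' ').split(',')]
['Chinese', 'Asian']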