I have issue with data scraping with python

I have issue with data scraping with python - python

I wrote a code for data scraping from airbnb.com actually I want to scrape all comments with details like listing name, total revies, revies, commenter name, date, comments but my code does not execute the try part it directly goes to except part. please guide me on how can I fix this issue. thank you!
here is my code:
import requests
from bs4 import BeautifulSoup
#import pandas as pd
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
try:
title = soup.find_all('span',class_="_18hrqvin",id=False).text
except:
title = 'empty'
print(title)
try:
reviews = soup.find_all('div',class_="_10za72m2",id=False).text
except:
reviews = 'empty revies'
print(reviews)
try:
total_reviews = soup.find_all('span',class_="_krjbj",id=False).text
except:
total_reviews = 'empty total revies'
print(total_reviews)
try:
total_reviews = soup.find_all('span',class_="_krjbj",id=False).text
except:
total_reviews = 'empty total revies'
print(total_reviews)
try:
commenter_name = soup.find_all('div',class_="_1p3joamp",id=False).text
except:
commenter_name = 'empty commenter_name'
print(commenter_name)
try:
comment_date = soup.find_all('span',class_="_1jlnvra2",id=False).text
except:
comment_date = 'empty comment_date'
print(comment_date)
try:
comment_date = soup.find_all('span',class_="_1jlnvra2",id=False).text
except:
comment_date = 'empty comment_date'
print(comment_date)
try:
comment = soup.find_all('div',class_="_czm8crp",id=False).text
except:
comment = 'empty comment'
print(comment)
def main():
url = "https://www.airbnb.com/rooms/34826867?source_impression_id=p3_1584615891_nVK823DKHNHuFWCQ"
get_detail_data(get_page(url))
if __name__ == '__main__':
main()

As suggested by #arcticsanto there is your get_page returns None if a bad response is found so get_detail_data fails to get BeautifulSoup so just add validation for soup in get_detail_data
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
if not soup:
return
----

Related

How to get all listings urls from main page with python web scraping

I wrote a code for web scraping, My code is ok just except two issues. From detail page, everything is ok just ISBN NO, and from main page, I need all listing URLs so that my code could scrape date from aa listings. Please guide me how can I fix this issue. Both(main page and details page )URLs are in the code. Thank you!
here is my code:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
try:
title = soup.find('span',class_="title product-field",id=False).text
except:
title = 'empty'
print(title)
try:
writer = soup.find('a',class_="contributor-name",id=False).text
except:
writer = 'empty'
print(writer)
try:
original_price = soup.find('div',class_="original-price",id=False).find('span').text
except:
original_price = 'empty'
print(original_price)
try:
active_price = soup.find('div',class_="active-price",id=False).find('span').text
except:
active_price = 'empty'
print(active_price)
try:
img = soup.find('div',class_="image-actions image-container product-type-icon-container book",id=False).find('img').attrs['src']
except:
img = 'empty'
print(img)
try:
isbn = soup.find('div',class_="bookitem-secondary-metadata",id=False).find('li').attrs['ISBN: ']
except:
isbn = 'empty'
print(isbn)
data = {
'title' : title,
'writer' : writer,
'original_price' : original_price,
'active_price' : active_price,
'image' : img,
'isbn' : isbn
}
return data
def get_index_data(soup):
titles_link = soup.find_all('a',class_="body_link_11")
try:
inks = soup.find('div', class_="item-info",id=False).find('p').find('a').get('href')
except:
inks = "empty"
print(inks)
def main():
#detail_page_url = "https://www.kobo.com/ww/en/ebook/mum-dad-1"
mainurl = "https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q"
#get_page(url)
#get_detail_data(get_page(detail_page_url))
get_index_data(get_page(mainurl))
if __name__ == '__main__':
main()

import requests
import re
import json
from bs4 import BeautifulSoup
import csv
def Soup(content):
soup = BeautifulSoup(content, 'html.parser')
return soup
def Main(url):
r = requests.get(url)
soup = Soup(r.content)
scripts = soup.findAll("script", type="application/ld+json",
text=re.compile("data"))
prices = [span.text for span in soup.select(
"p.product-field.price span span") if span.text != "USD"]
with open("data.csv", 'w', newline="") as f:
writer = csv.writer(f)
writer.writerow(["Title", "Writer", "Price", "ISBN", "IMG", "URL"])
for script, price in zip(scripts, prices):
script = json.loads(script.text)
title = script["data"]["name"]
author = script["data"]["author"][0]["name"]
img = f'https:{script["data"]["thumbnailUrl"]}'
isbn = script["data"]["isbn"]
url = script["data"]["url"]
writer.writerow([title, author, price, isbn, img, url])
Main("https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q")
Output: View-Online
Output Sample:

How to scrape multiple result having same tags and class

My code is accurate for single page but when I run this code for multiple records using for loop and if there are some data missing like person then (as I used index no[1] and [2] for person variable ,location, phone no and cell no but if there are something missing like person name is missing) next record will be extracted at person variable. Could you please fix this issue?
here is my code:
import requests
from bs4 import BeautifulSoup
import re
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'lxml') # 1. html , 2. parser
return soup
def get_detail_data(soup):
#soup = BeautifulSoup(r.text, 'html.parser')
try:
title = soup.find("h1", {'class': 'sc-AykKI'}).text
except:
title = 'Empty Title'
#print(title)
try:
person = soup.find(
"span", {'class': 'Contact__Item-sc-1giw2l4-2 kBpGee'}).text.strip()
except:
person = 'Empty Person'
#print(person)
try:
addr = soup.findAll(
"span", {'class': 'Contact__Item-sc-1giw2l4-2 kBpGee'})[1].text
except:
addr = 'Empty Address'
#print(addr)
#abn = soup.find('div', class_="box__Box-sc-1u3aqjl-0 kxddET").('a').text
#print(abn)
try:
ratting = soup.find(
"div", {'class': 'Rating__RatingText-sc-1r9ytu8-1 jIdgkl'}).text
except:
ratting = 'Empty Ratting'
#print(ratting)
try:
abn = (re.search('abn\\\\":\\\\"(.*?)\\\\"', soup.text).group(1))
except:
abn = 'Empty ABN'
#print(abn)
try:
website = (re.search('website\\\\":\\\\"(.*?)\\\\"', soup.text).group(1))
except:
website = 'Empty Website'
#print(website )
try:
phone = (re.search('phone\\\\":\\\\"(.*?)\\\\"', soup.text).group(1))
except:
phone = 'Empty Phone No'
#print(phone)
try:
cell = (re.search('mobile\\\\":\\\\"(.*?)\\\\"', soup.text).group(1))
except:
cell = 'Empty Cell No'
#print(cell)
data = {
'title' : title,
'peron name' : person,
'address' : addr,
'phone no' : phone,
'cell no' : cell,
'abn no' : abn,
'website' : website
}
return data
def get_index_data(soup):
#soup = BeautifulSoup(r.text, 'html.parser')
titles = []
for item in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
urls = (f"https://hipages.com.au{item.previous_element.get('href')}")
titles.append(urls)
#print(titles)
return titles
def Main():
url = "https://hipages.com.au/connect/abcelectricservicespl/service/126298"
mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
main_titles = get_index_data(get_page(mainurl))
for title in main_titles:
data1 = get_detail_data(get_page(title))
print(data1)
Main()

You need to parse your data from the script tag rather than the spans and divs.
Try this:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from pandas import json_normalize
import json
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'lxml')
return soup
def get_detail_data(url):
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
raw = res.text.split("<script> window.__INITIAL_STATE__=")[1]
raw = raw.split("</script>")[0]
data = json.loads(raw)
data = json.loads(data)
cols = ['abn', 'address', 'name', 'primary_location', 'service_area', 'state', 'suburb', 'website']
df = pd.DataFrame(data["sites"]["list"]).T
df = df[cols].reset_index(drop=True)
primary_location = json_normalize(df.primary_location[0])
df = pd.concat([df, primary_location], axis=1)
to_drop = ["primary_location", "is_primary", "suburb_seo_key", "capital_city_seo_key"]
df.drop(to_drop, axis=1, inplace=True)
return df
def get_index_data(soup):
titles = []
for item in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
urls = (f"https://hipages.com.au{item.previous_element.get('href')}")
titles.append(urls)
return titles
def Main():
mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
main_titles = get_index_data(get_page(mainurl))
final_data = []
for title in main_titles:
data = get_detail_data(title)
final_data.append(data)
return final_data
data = Main()
df = pd.concat(data).reset_index(drop=True)
display(df)
This gives you much more detailed data by the way.

How to extract data (title, email link, location) from first 10 pages using python

It is a public website containg data of compaines in Sweden. I want to extract data like (Title of companies, Email Link and Location ) from first 10 pages. I write code but it does not provide right result so please help me out. Thank you!
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'lxml') # 1. html , 2. parser
return soup
def get_detail_data(soup):
try:
title = (soup.find('h1',class_="Name",id=False).text)
except:
title = 'empty'
try:
address = (soup.find('p','font', class_=False,id=False).text)
except:
address = 'No location'
try:
email = soup.select_one("a[href^='mailto:']").get("href").split("mailto:")[1]
except:
email = 'No email'
data = {
'title' : title,
'address' : address,
'email' : email
}
return data
def get_index_data(soup):
try:
titles_link = soup.find_all('div', {'class': 'Name'})
for title in titles_link:
link = soup.find('a', {'title': title.text}).get('href')
titles = ('https://www.118100.se' + link)
except:
titles_link = []
return titles
def main():
url = "https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0"
#get_page(url)
#get_detail_data(get_page(url))
products = get_index_data(get_page(url))
for product in products:
data = get_detail_data(get_page(product))
print(data)
if __name__ == '__main__':
main()

The problem is the function get_index_data. You are not returning a list, you are returning a single link.
You should convert the variable titles in a list. For example:
def get_index_data(soup):
titles = []
try:
titles_link = soup.find_all('div', {'class': 'Name'})
for title in titles_link:
link = soup.find('a', {'title': title.text}).get('href')
titles.append('https://www.118100.se' + link)
except:
return []
return titles

I am trying to parse data from all pages. Only the first page is parsed

I am trying to parse data from all pages. Parsing ends after the first page. What could be the problem?
I use pagination with the use of a regular expression.
The first page of the site and others differ in the html code, so I have to create two different functions main_1 and main_2 for the first and other pages.
If you try to run only the main_2 function, nothing will work. .CSV file will not be created.
help me please.
import requests
from bs4 import BeautifulSoup
import csv
import re
def get_html(url):
r = requests.get(url)
if r.ok:
return r.text
print(r.status_code)
def writer_csv(data):
with open('tesr.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow((data['name'], data['url'], data['price']))
def get_data_page(html):
soup = BeautifulSoup(html, 'lxml')
trs = soup.find_all('tr', class_='cmc-table-row')
for tr in trs:
tds = tr.find_all('td')
try:
name = tds[1].find('a', class_='cmc-link').text.strip()
except:
name = ''
try:
url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
except:
url = ''
try:
price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
except:
price = ''
data = {'name': name,
'url': url,
'price': price}
writer_csv(data)
def main_1():
url_1 = 'https://coinmarketcap.com/'
get_data_page(get_html(url_1))
def main_2():
url_2 = 'https://coinmarketcap.com/2/'
while True:
get_data_page(get_html(url_2))
soup = BeautifulSoup(get_html(url_2), 'lxml')
try:
pattern = 'Next '
url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
except:
break
main_1()
main_2()

how to get products names from amazon

I saw all the relevant previous topics about that manner and i've learned a lot (especially about the difference between lxml and html.parser)
anyway after i changed my BeautifulSoup to parse the page as lxml I still can't get all the time the same result .
most of the time i get : " name = soup.find('span', id="productTitle").text
AttributeError: 'NoneType' object has no attribute 'text' "
but in a few times i get the real name of the product.
what am I missing?
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle").text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()

Presumably you are getting different html back. Check the html itself. You can add in a test of whether variables are None before attempting to access .text
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle")
if name is None:
name = 'N/A'
title = 'n/a'
else:
name = name.text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

I have issue with data scraping with python - python

Related

How to get all listings urls from main page with python web scraping

How to scrape multiple result having same tags and class

How to extract data (title, email link, location) from first 10 pages using python

I am trying to parse data from all pages. Only the first page is parsed

how to get products names from amazon

Categories

Resources