I'm getting a KeyError trying to scrape data from a website - Python

I wrote code for data scraping; it works well for some pages, but for others it raises:
KeyError: 'isbn'.
Could you please guide me on how I can solve this issue?
Here is my code:
import requests
import re
import json
from bs4 import BeautifulSoup
import csv
import sys
import codecs


def Soup(content):
    soup = BeautifulSoup(content, 'html.parser')
    return soup


def Main(url):
    r = requests.get(url)
    soup = Soup(r.content)
    scripts = soup.findAll("script", type="application/ld+json",
                           text=re.compile("data"))
    prices = [span.text for span in soup.select(
        "p.product-field.price span span") if span.text != "USD"]
    with open("AudioBook/Fiction & Literature/African American.csv", 'a', encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Writer", "Price", "IMG", "URL", "ISBN"])
        for script, price in zip(scripts, prices):
            script = json.loads(script.text)
            title = script["data"]["name"]
            author = script["data"]["author"][0]["name"]
            img = f'https:{script["data"]["thumbnailUrl"]}'
            isbn = script["data"]["isbn"]
            url = script["data"]["url"]
            writer.writerow([title, author, price, img, url, isbn])


for x in range(1, 10):
    url = "https://www.kobo.com/ww/en/audiobooks/contemporary-1?pageNumber=" + str(x)
    print("Scraping page " + str(x) + ".....")
    Main(url)

Since audiobooks don't have an ISBN on the listings page, you could prepare for this case with a default value, e.g.:
isbn = script["data"].get("isbn", "")
In this case, if the "isbn" key doesn't exist in script["data"], it will fall back to an empty string.
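The same .get pattern would cover the other keys (name, author, thumbnailUrl) as well, should any of them ever be missing from a listing.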
Alternatively, you could get the book ISBN from the audiobook-specific page (your script["data"]["url"] above), e.g.:
def Main(url):
    r = requests.get(url)
    soup = Soup(r.content)
    scripts = soup.findAll("script", type="application/ld+json",
                           text=re.compile("data"))
    prices = [span.text for span in soup.select(
        "p.product-field.price span span") if span.text != "USD"]
    with open("AudioBook/Fiction & Literature/African American.csv", 'a', encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Writer", "Price", "IMG", "URL", "ISBN"])
        for script, price in zip(scripts, prices):
            script = json.loads(script.text)
            title = script["data"]["name"]
            author = script["data"]["author"][0]["name"]
            img = f'https:{script["data"]["thumbnailUrl"]}'
            # NEW CODE
            url = script["data"]["url"]
            if "isbn" in script["data"]:
                # ebook listings
                isbn = script["data"]["isbn"]
            else:
                # audiobook listings
                r = requests.get(url)
                inner_soup = Soup(r.content)
                try:
                    inner_script = json.loads(
                        inner_soup.find("script", type="application/ld+json",
                                        text=re.compile("workExample")).text)
                    isbn = inner_script["workExample"]["isbn"]
                except AttributeError:
                    isbn = ""
            # END NEW CODE
            writer.writerow([title, author, price, img, url, isbn])
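Note that this fallback costs one extra HTTP request per audiobook (the inner requests.get of the detail page), so listing pages with many audiobooks will scrape noticeably slower than before.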

Related

Unable to write all values of a list to a CSV row: values are extracted from the page and stored in variables, but are missing in the file

Values are scraped correctly from the webpage, but when I try to write them to the CSV as a row, the company name isn't written to the file.
Here is link to webPage : http://search.sunbiz.org/Inquiry/CorporationSearch/SearchResultDetail?inquirytype=EntityName&directionType=Initial&searchNameOrder=A%201421260&aggregateId=domp-142126-360258c3-c08b-4f9a-8866-ad3ecdedef02&searchTerm=A&listNameOrder=A%201421260
import requests
from bs4 import BeautifulSoup
from requests import get
from json import loads
import csv


def writeFile(data):
    with open("A.csv", "a") as file:
        writer = csv.writer(file)
        writer.writerow(data)


def company_page(url):
    docnum, fenum, fdate, edate, status, levent, companyName, year = '', '', '', '', '', '', '', ''
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    # print(soup.prettify())
    reqDiv = soup.find("div", class_="detailSection filingInformation")
    companyName = soup.find("div", class_="detailSection corporationName").text
    for lab in reqDiv.select("label"):
        if(lab.text == 'Document Number'):
            docnum = lab.find_next_sibling().text
        if(lab.text == 'FEI/EIN Number'):
            fenum = lab.find_next_sibling().text
        if(lab.text == 'Date Filed'):
            fdate = lab.find_next_sibling().text
        if(lab.text == 'Effective Date'):
            edate = lab.find_next_sibling().text
            year = lab.find_next_sibling().text.split('/')[2]
        if(lab.text == 'Status'):
            status = lab.find_next_sibling().text
        if(lab.text == 'Last Event'):
            levent = lab.find_next_sibling().text
        else:
            continue
    details = []
    details.append(companyName)
    details.append(docnum)
    details.append(fenum)
    details.append(fdate)
    details.append(edate)
    details.append(status)
    details.append(levent)
    writeFile(details)
I am getting the following values in the CSV (screenshot: https://i.stack.imgur.com/acUjN.png).
I want the company name in the CSV file as well. It is stored in the variable companyName but never gets written to the CSV.
import requests
from bs4 import BeautifulSoup
import csv


def Main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    name = soup.find("div", class_="detailSection corporationName").get_text(
        strip=True, separator=" ")
    data = [item.string for item in soup.select(
        "div.detailSection.filingInformation span")]
    del data[5]
    # print(data[2:-2])
    with open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Document Number",
                         "FEI/EIN Number", "Date Filed", "Status", "Last Event"])
        writer.writerow([name, *data[2:-2]])


Main("http://search.sunbiz.org/Inquiry/CorporationSearch/SearchResultDetail?inquirytype=EntityName&directionType=Initial&searchNameOrder=A%201421260&aggregateId=domp-142126-360258c3-c08b-4f9a-8866-ad3ecdedef02&searchTerm=A&listNameOrder=A%201421260")

How to get all listing URLs from the main page with Python web scraping

I wrote code for web scraping. My code is OK except for two issues: on the detail page, everything works except the ISBN number, and from the main page I need all listing URLs so that my code can scrape data from all the listings. Please guide me on how I can fix this. Both URLs (main page and detail page) are in the code. Thank you!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html , 2. parser
        return soup


def get_detail_data(soup):
    try:
        title = soup.find('span', class_="title product-field", id=False).text
    except:
        title = 'empty'
    print(title)
    try:
        writer = soup.find('a', class_="contributor-name", id=False).text
    except:
        writer = 'empty'
    print(writer)
    try:
        original_price = soup.find('div', class_="original-price", id=False).find('span').text
    except:
        original_price = 'empty'
    print(original_price)
    try:
        active_price = soup.find('div', class_="active-price", id=False).find('span').text
    except:
        active_price = 'empty'
    print(active_price)
    try:
        img = soup.find('div', class_="image-actions image-container product-type-icon-container book", id=False).find('img').attrs['src']
    except:
        img = 'empty'
    print(img)
    try:
        isbn = soup.find('div', class_="bookitem-secondary-metadata", id=False).find('li').attrs['ISBN: ']
    except:
        isbn = 'empty'
    print(isbn)
    data = {
        'title': title,
        'writer': writer,
        'original_price': original_price,
        'active_price': active_price,
        'image': img,
        'isbn': isbn
    }
    return data


def get_index_data(soup):
    titles_link = soup.find_all('a', class_="body_link_11")
    try:
        inks = soup.find('div', class_="item-info", id=False).find('p').find('a').get('href')
    except:
        inks = "empty"
    print(inks)


def main():
    # detail_page_url = "https://www.kobo.com/ww/en/ebook/mum-dad-1"
    mainurl = "https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q"
    # get_page(url)
    # get_detail_data(get_page(detail_page_url))
    get_index_data(get_page(mainurl))


if __name__ == '__main__':
    main()
import requests
import re
import json
from bs4 import BeautifulSoup
import csv


def Soup(content):
    soup = BeautifulSoup(content, 'html.parser')
    return soup


def Main(url):
    r = requests.get(url)
    soup = Soup(r.content)
    scripts = soup.findAll("script", type="application/ld+json",
                           text=re.compile("data"))
    prices = [span.text for span in soup.select(
        "p.product-field.price span span") if span.text != "USD"]
    with open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Writer", "Price", "ISBN", "IMG", "URL"])
        for script, price in zip(scripts, prices):
            script = json.loads(script.text)
            title = script["data"]["name"]
            author = script["data"]["author"][0]["name"]
            img = f'https:{script["data"]["thumbnailUrl"]}'
            isbn = script["data"]["isbn"]
            url = script["data"]["url"]
            writer.writerow([title, author, price, isbn, img, url])


Main("https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q")

Scrape websites with a unique URL (Python)

I am currently working on a web scraping project, but I am having difficulties with the URL of the website, because it does not change when I go through the pages.
The website: https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail
My goal is to scrape all the buildings on the two pages.
The only way I can scrape the data is by using the inspect tool and copying the wrapper around all the ads.
This is my code:
from bs4 import BeautifulSoup
import requests
import csv
import string
import glob

# Grab the soup (content)
source = requests.get("https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
soup = BeautifulSoup(source.content, 'html.parser')

cnt = 0
# Loop through all the ads on the page
for ad in soup.find_all('div', {"data-id": "templateThumbnailItem"}):
    if (soup.find('div', {"class": "price"})):
        # Get the address
        address = ad.find('span', {"class": "address"})
        address = address.findChild().text
        address = address.strip()
        # Get the district
        district = ad.find('span', {"class": "address"})
        district = district.findChildren()[1].text
        district = district.strip()
        # Get the type
        typeBuilding = ad.find('span', {"class": "category"}).text
        typeBuilding = typeBuilding.strip()
        typeBuilding = typeBuilding[0:7].strip()
        # Get the Price
        price = ad.find('span', {"itemprop": "price"}).text
        price = price.replace('$', '')
        price = price.replace(u'\xa0', '')
        price = int(str(price))
        cnt = cnt + 1
        print(f'Adresse: {address}, Quartier: {district}, Type: {typeBuilding}, Prix: {price}$')
Thank you for helping!
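The URL doesn't change because the listings are loaded in the background: the site fetches each batch of 20 ads with a POST request to https://www.centris.ca/Mvc/Property/GetInscriptions, passing a startPosition offset. The answer below replays that request directly (startPosition 0 and 20 cover the two pages) and parses the HTML fragment returned inside the JSON response: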
import requests
from bs4 import BeautifulSoup
import csv


def main(url):
    with requests.Session() as req:
        r = req.get(
            "https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
        with open("data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Address", "Quartier", "Type", "Price"])
            for num in range(0, 40, 20):
                data = {'startPosition': num}
                r = req.post(url, json=data).json()
                html = r["d"]["Result"]["html"]
                soup = BeautifulSoup(html, 'html.parser')
                prices = [format(int(price.get("content")), ',d') for price in soup.findAll(
                    "span", itemprop="price")]
                block = soup.findAll("div", class_="location-container")
                ty = [ty.div.get_text(strip=True) for ty in block]
                add = [add.select_one(
                    "span.address div").text for add in block]
                quartier = [quar.select_one(
                    "span.address div:nth-child(2)").text for quar in block]
                final = zip(add, quartier, ty, prices)
                writer.writerows(final)


main("https://www.centris.ca/Mvc/Property/GetInscriptions")

I am trying to parse data from all pages. Only the first page is parsed

I am trying to parse data from all pages, but parsing stops after the first page. What could be the problem?
I paginate by matching the "Next" link with a regular expression.
The first page of the site differs from the others in its HTML, so I had to create two functions, main_1 and main_2, for the first page and the remaining pages.
If I run only the main_2 function, nothing happens and the .csv file is not created.
Please help me.
import requests
from bs4 import BeautifulSoup
import csv
import re


def get_html(url):
    r = requests.get(url)
    if r.ok:
        return r.text
    print(r.status_code)


def writer_csv(data):
    with open('tesr.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))


def get_data_page(html):
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr', class_='cmc-table-row')
    for tr in trs:
        tds = tr.find_all('td')
        try:
            name = tds[1].find('a', class_='cmc-link').text.strip()
        except:
            name = ''
        try:
            url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
        except:
            url = ''
        try:
            price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
        except:
            price = ''
        data = {'name': name,
                'url': url,
                'price': price}
        writer_csv(data)


def main_1():
    url_1 = 'https://coinmarketcap.com/'
    get_data_page(get_html(url_1))


def main_2():
    url_2 = 'https://coinmarketcap.com/2/'
    while True:
        get_data_page(get_html(url_2))
        soup = BeautifulSoup(get_html(url_2), 'lxml')
        try:
            pattern = 'Next '
            url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
        except:
            break


main_1()
main_2()
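One thing that makes this hard to debug is the bare except around the "Next"-link lookup in main_2: any failure (changed pagination markup, a failed request) silently ends the loop. A more transparent alternative is to paginate by page number and stop at the first non-OK response. A minimal sketch, reusing get_html and get_data_page from above and assuming the /2/, /3/, ... URL scheme shown in the question:
def main_all(max_pages=10):
    for n in range(1, max_pages + 1):
        # Page 1 lives at the root; later pages use /<n>/ (assumed from the question's URLs).
        url = 'https://coinmarketcap.com/' if n == 1 else 'https://coinmarketcap.com/{}/'.format(n)
        html = get_html(url)
        if html is None:
            # get_html returns None on a non-200 response; treat that as running past the last page.
            break
        get_data_page(html)


main_all()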

Python: writing scraped data to a CSV file

I wrote simple code which scrapes data from a website, but I'm struggling to save all rows to the CSV file. The finished script saves only one row: the last occurrence in the loop.
def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        # print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])
What's important is to save the data in the corresponding columns.
@PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv

url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")


def main_spider(max_page):
    page = 1
    while page <= max_page:
        url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.content, "html.parser")
        for link in soup.find_all('article', class_='small-product'):
            url = "http://www.selgros24.pl"
            a = link.findAll('a')[0].get('href')
            href = url + a
            # print href
            get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])


main_spider(1)
The problem is that you re-open the output file in write mode ("wb") on every call to get_single_item_data: each open truncates the file, wiping the rows written by previous calls, and the file is closed again once the function returns and f goes out of scope.
You want to open the file once and pass it (or its csv.writer) into get_single_item_data so the rows accumulate.
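A minimal sketch of that fix, ported to Python 3 (text mode with newline="" replaces the Python 2 "wb" mode; function and file names are kept from the question):
import requests
from bs4 import BeautifulSoup
import csv


def get_single_item_data(writer, item_url):
    # The writer is created by the caller, so every item appends to the same file.
    soup = BeautifulSoup(requests.get(item_url).content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        company = item_name.find('div', class_='productDesc').find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        writer.writerow([title, company, price_netto])


def main_spider(max_page):
    # Open the output file exactly once, before crawling.
    with open("scrpe.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Company", "Price_netto"])
        for page in range(1, max_page + 1):
            url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
            soup = BeautifulSoup(requests.get(url).content, "html.parser")
            for link in soup.find_all('article', class_='small-product'):
                href = "http://www.selgros24.pl" + link.findAll('a')[0].get('href')
                get_single_item_data(writer, href)


main_spider(1)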
