Web scraping from the HTML of a database website using Python

I am new to Python and am learning things slowly. I have previously made API calls to databases to extract information. However, I am now dealing with a particular Indian database, and its HTML seems confusing when I try to extract the specific information I am looking for. Basically, I have a list of herb links as input, which look like this (only the ID changes):
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
When I open each of these, I want to extract the "Distribution" detail for that herb from the webpage. That's all. But in the HTML, I can't figure out which element holds that detail. I tried a lot before coming here. Can someone please help me? Thanks in advance.
Code:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path
from pprint import pprint

user_home = os.path.expanduser('~')
OUTPUT_DIR = os.path.join(user_home, 'vk_frlht')
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

herb_url = 'http://envis.frlht.org/bot_search'
response = requests.get(herb_url)
soup = BeautifulSoup(response.text, "html.parser")
token = soup.find('Type Botanical Name', {'type': 'hidden', 'name': 'token'})

herb_query_url = 'http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8'
response = requests.get('http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8')

# optional code for many links at once
with open(Path, 'r') as f:
    frlhtinput = f.readlines()
frlht = [x[:-1] for x in frlhtinput]
for line in frlht:
    out = requests.get(f'http://envis.frlht.org/plantdetails/{line}')
# end of the optional code

herb_query_soup = BeautifulSoup(response.text, "html.parser")
text = herb_query_soup.find('div', {'id': 'result-details'})
pprint(text)

This is how the page looks after scraping: a loading spinner in the middle, which means the content is only rendered after JavaScript executes. In other words, the data is loaded dynamically by JS code, so requests + BeautifulSoup alone will not see it. You have to drive a real browser with Selenium instead of plain BS4.
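For example, here is a minimal Selenium sketch, not taken from the original answer: it reuses the result-details id from the question's code, assumes Chrome and a matching driver are available, and the selector may need adjusting once the page actually renders.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = "http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8"

driver = webdriver.Chrome()  # assumes Chrome and a matching driver are available
try:
    driver.get(url)
    # wait until the JS-rendered details container appears
    details = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, "result-details"))
    )
    print(details.text)  # rendered text, including the "Distribution" section
finally:
    driver.quit()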

Try this instead: the details are loaded from an internal URL that you can request directly.
import requests
from bs4 import BeautifulSoup
from pprint import pprint

plant_ids = ["3315", "2133", "845", "363"]

results = []
for plant_id in plant_ids:
    herb_query_url = f"http://envis.frlht.org/plantdetails/{plant_id}/fd01bd598f0869d65fe5a2861845f9f8"
    headers = {
        "Referer": herb_query_url,
    }
    response = requests.get(
        f"http://envis.frlht.org/bot_search/plantdetails/plantid/{plant_id}/nocache/0.7763327765552295/referredfrom/extplantdetails",
        headers=headers,
    )
    herb_query_soup = BeautifulSoup(response.text, "html.parser")
    result = herb_query_soup.findAll("div", {"class": "initbriefdescription"})
    for r in result:
        result_dict = {r.text.split(":", 1)[0].strip(): r.text.split(":", 1)[1].strip()}
        results.append(result_dict)
pprint(results)
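Since only the "Distribution" detail is wanted, the collected dictionaries can be filtered afterwards. A small follow-up sketch, appended to the script above (it relies on the results list built there):

# results is a flat list of one-key dicts across all plant ids, in order
distributions = [d["Distribution"] for d in results if "Distribution" in d]
for d in distributions:
    print(d)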


The information is actually served from a different URL than the ones you have. You first need to construct that URL (it was found by watching the requests the browser makes) and request it. The scraped fields can then be written to a CSV file as follows. The script assumes you have a text file IDs.txt containing:
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
import requests
from bs4 import BeautifulSoup
import csv

fieldnames = ["ID", "Accepted Name", "Family", "Used in", "Distribution"]

with open('IDs.txt') as f_input, open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, extrasaction='ignore')
    csv_output.writeheader()

    for line in f_input:
        url = line.strip()  # Remove newline
        print(url)
        url_split = url.split('/')
        url_details = f"http://envis.frlht.org/bot_search/plantdetails/plantid/{url_split[4]}/nocache/{url_split[5]}/referredfrom/extplantdetails"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
            'Referer': url,
        }
        req = requests.get(url_details, headers=headers)
        soup = BeautifulSoup(req.content, "html.parser")
        row = {field: '' for field in fieldnames}  # default values
        row['ID'] = url_split[4]

        for div in soup.find_all('div', class_="initbriefdescription"):
            entry, value = div.get_text(strip=True).split(":", 1)
            row[entry] = value

        csv_output.writerow(row)
Giving an output starting:
ID,Accepted Name,Family,Used in,Distribution
3315,Amaranthus hybridusL. subsp.cruentusvar.paniculatusTHELL.,AMARANTHACEAE,"Ayurveda, Siddha, Folk","This species is globally distributed in Africa, Asia and India. It is said to be cultivated as a leafy vegetable in Maharashtra, Karnataka (Coorg) and on the Nilgiri hills of Tamil Nadu. It is also found as an escape."
2133,Triticum aestivumL.,POACEAE,"Ayurveda, Siddha, Unani, Folk, Chinese, Modern",
845,Dolichos biflorusL.,FABACEAE,"Ayurveda, Siddha, Unani, Folk, Sowa Rigpa","This species is native to India, globally distributed in the Paleotropics. Within India, it occurs all over up to an altitude of 1500 m. It is an important pulse crop particularly in Madras, Mysore, Bombay and Hyderabad."
363,Brassica oleraceaL.,BRASSICACEAE,"Ayurveda, Siddha",

Related

BS4 - 'NoneType' object has no attribute 'findAll' when scanning spans on amazon page

I'm following a Udemy course on BS4 and it seems to be a bit outdated, so I'm having trouble with this part.
The objective is to scrape the price of a TV from an Amazon page. In the course the instructor also hits this error and fixes it by changing the class name he searches for via findAll. I tried the same thing (with a different class, not the one he used) and got the attribute error again. According to the answer to a similar question, the class being searched for didn't contain what was being looked for, but I don't believe that is what's happening to me.
The code: https://pastebin.com/SMQBXt31
from datetime import datetime
import requests
import csv
import bs4

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15"
REQUEST_HEADER = {
    "User-Agent": USER_AGENT,
    "Accept-Language": "en-US, en;q=0.5"
}

def get_page_html(url):
    res = requests.get(url=url, headers=REQUEST_HEADER)  # res = response
    return res.content

def get_product_price(soup):
    main_price_span = soup.find("span", attrs={
        "class": "a-price aok-align-center reinventPricePriceToPayPadding priceToPay"
    })
    price_spans = main_price_span.findAll("span")
    for span in price_spans:
        price = span.text.strip().replace("$", "").replace(",", "")
        print(price)

def extract_product_info(url):
    product_info = {}
    print(f"Scraping URL: {url}")
    html = get_page_html(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    product_info["price"] = get_product_price(soup)

if __name__ == '__main__':
    with open("amazon_products_urls.csv", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            url = row[0]
            print(extract_product_info(url))
The website: https://www.amazon.com/Hisense-Premium-65-Inch-Compatibility-65U8G/dp/B091XWTGXL/ref=sr_1_1_sspa?crid=3NYCKNFHL6DU2&keywords=hisense%2Bpremium%2B65%2Binch&qid=1651840513&sprefix=hisense%2Bpremium%2B65%2Binch%2B%2Caps%2C116&sr=8-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyVzUyTjBMS1JCVFVRJmVuY3J5cHRlZElkPUEwNDY2ODc0MlozVlFMVFJKQ0s2VyZlbmNyeXB0ZWRBZElkPUEwODI5OTgxMTRZSjdMMzYyQjk4NyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
There are a lot of spans in there; you have to select only the price spans, which are located in class="a-size-mini olpWrapper":
price_spans = main_price_span.find_all("span", class_="a-size-mini olpWrapper")
for span in price_spans:
    price = span.text.strip().replace("$", "").replace(",", "")
    print(price)

# OR
price_spans = [x.get_text(strip=True).replace("$", "") for x in main_price_span.find_all("span", class_="a-size-mini olpWrapper")]
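Separately, the original traceback comes from main_price_span being None when that outer class is not found on the page (blocked request, changed markup). A defensive sketch of get_product_price, reusing the question's selector and the answer's inner class, that avoids the NoneType error:

def get_product_price(soup):
    main_price_span = soup.find("span", attrs={
        "class": "a-price aok-align-center reinventPricePriceToPayPadding priceToPay"
    })
    if main_price_span is None:
        # the class was not found on the page, so bail out instead of crashing
        print("price container not found")
        return None
    for span in main_price_span.find_all("span", class_="a-size-mini olpWrapper"):
        print(span.text.strip().replace("$", "").replace(",", ""))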

converting get_text() output from bs4 into a csv with headers

I'm building a web scraper and I'm a bit stuck trying to manipulate the data I get out of bs4.
I'm trying to get the text of ('div', class_='listing__content__wrapper') nicely organized under four headers (headerList = ['streetName', 'city', 'province', 'postalCode']).
I got as far as writing it into a CSV file, but I can't get it into rows and columns.
Any help is appreciated.
Here is my code so far:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import csv

headers = {
    "User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

# we can ask for the url of the page you want to scrape here, remove after tests are successful.
# url = input("Enter url to scrape: ")
# for testing
url = 'https://www.yellowpages.ca/search/si/1/gym/Toronto+ON'

page = requests.get(url, headers=headers)

# tag and class of interest to parse
parse_only = SoupStrainer('div', class_='listing__content__wrapper')
soup = BeautifulSoup(page.content, 'html.parser', parse_only=parse_only)

streetaddress = soup.find_all('span', class_='jsMapBubbleAddress')

with open('test.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for line in streetaddress:
        writer.writerow(line.get_text())

# using a function works but still can't get all the data under the 4 columns/headerList
def append_to_csv(input_string):
    with open("test2.csv", "a") as csv_file:
        csv_file.write(input_string.get_text().strip() + "|")

for line in streetaddress:
    append_to_csv(line)

# for listing in streetaddress:
#     print((listing.get_text()), file=open('streetaddresses.csv', 'a'), sep='|')
I think this will do what you want.
fields = ['streetAddress', 'addressLocality', 'addressRegion', 'postalCode']
gather = {}

with open('test.csv', 'a') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fields)
    writer.writeheader()
    for line in streetaddress:
        gather[line.attrs["itemprop"]] = line.get_text()
        if line.attrs["itemprop"] == "postalCode":
            writer.writerow(gather)
            gather = {}
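If you prefer pandas (already imported in the question), the same row-grouping idea can be written by collecting dicts and letting DataFrame.to_csv emit the headers. A sketch, continuing from the streetaddress spans above and assuming the same itemprop attributes:

import pandas as pd

rows, gather = [], {}
for line in streetaddress:
    gather[line.attrs["itemprop"]] = line.get_text()
    if line.attrs["itemprop"] == "postalCode":  # last field of each listing
        rows.append(gather)
        gather = {}

df = pd.DataFrame(rows, columns=['streetAddress', 'addressLocality', 'addressRegion', 'postalCode'])
df.to_csv('test.csv', index=False)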

Trying to get BeautifulSoup to download some data. Getting no error, but nothing is downloaded

I just tried to run the code below. I got no error message, but no data was actually written to the CSV. I looked at the website and I found both snapshot-td2-cp and snapshot-td2 elements. When I remove the writer.writerow statements and use print statements, I see six number 2 characters, and that's it.
import csv
import requests
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX', 'MSFT', 'AAPL']
url_list = [url_base + s for s in tckr]

with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            fsoup = BeautifulSoup(fpage.content, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except:
            print("{} - not found".format(url))
In the SBUX example, I want to get data from this table.
I tested this code a few months ago, and everything worked fine. Can someone point out my mistake? I'm not seeing it. Thanks.
To get the data, specify User-Agent in your requests.
import csv
import requests
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX', 'MSFT', 'AAPL']
url_list = [(s, url_base + s) for s in tckr]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for t, url in url_list:
        print('Scrapping ticker {}...'.format(t))
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
        writer.writerow([t])
        for row in soup.select('.snapshot-table2 tr'):
            writer.writerow([td.text for td in row.select('td')])
Prints:
Scrapping ticker SBUX...
Scrapping ticker MSFT...
Scrapping ticker AAPL...
and saves data.csv (Screenshot from LibreOffice):
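As a side note, since the snapshot table is a regular HTML table, pandas.read_html can parse it from the fetched page as well. A sketch under the same User-Agent assumption (the site rejects requests without one), reusing the snapshot-table2 class from the answer above:

import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
r = requests.get('https://finviz.com/quote.ashx?t=SBUX', headers=headers)

# read_html returns a list of DataFrames; attrs narrows it to the snapshot table
# (requires lxml or html5lib to be installed)
tables = pd.read_html(r.text, attrs={'class': 'snapshot-table2'})
tables[0].to_csv('sbux_snapshot.csv', index=False)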

Want to scrape from a web page and its next pages

I want to extract Company Name, Person, Country, Phone and Email to an Excel/CSV file. I tried the following code, but it writes only one value to the file. How do I loop over the first page and the next pages too?
import csv
import re
import requests
import urllib.request
from bs4 import BeautifulSoup

for page in range(10):
    url = "http://www.aepcindia.com/buyersdirectory"
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
    tbody = soup('div', {'class': 'view-content'})  # [0].find_all('')
    f = open('filename.csv', 'w', newline='')
    Headers = "Name,Person,Country,Email,Phone\n"
    csv_writer = csv.writer(f)
    f.write(Headers)
    for i in tbody:
        try:
            name = i.find("div", {"class": "company_name"}).get_text()
            person = i.find("div", {"class": "title"}).get_text()
            country = i.find("div", {"class": "views-field views-field-field-country"}).get_text()
            email = i.find("div", {"class": "email"}).get_text()
            phone = i.find("div", {"class": "telephone_no"}).get_text()
            print(name, person, country, email, phone)
            f.write("{}".format(name).replace(",", " ") + ",{}".format(person) + ",{}".format(country) + ",{}".format(email) + ",{}".format(phone) + "\n")
        except AttributeError:
            pass
    f.close()
Here is the link of the web page
http://www.aepcindia.com/buyersdirectory
import requests
from bs4 import BeautifulSoup
import csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}

def main(url):
    with requests.Session() as req:
        with open("data.csv", 'w', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Title", "Country", "Email", "Phone"])
            for item in range(0, 10):
                print(f"Extracting Page# {item + 1}")
                r = req.get(url.format(item), headers=headers)
                soup = BeautifulSoup(r.content, 'html.parser')
                name = [name.text for name in soup.select("div.company_name")]
                title = [title.text for title in soup.select("div.title")]
                country = [country.text for country in soup.findAll(
                    "div", class_="field-content", text=True)]
                email = [email.a.text for email in soup.select("div.email")]
                phone = [phone.text for phone in soup.select("div.telephone_no")]
                data = zip(name, title, country, email, phone)
                writer.writerows(data)

main("http://www.aepcindia.com/buyersdirectory?page={}")
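One caveat about the zip() approach: if any selector matches fewer elements on a page (for example a listing without an email), zip silently truncates to the shortest list, so listings can be dropped or rows misaligned. A small self-contained illustration of the difference, with toy data standing in for the scraped lists:

from itertools import zip_longest

# toy example: the email list is shorter than the name list
name = ["A Corp", "B Corp", "C Corp"]
email = ["a@example.com", "b@example.com"]

print(list(zip(name, email)))                        # C Corp is silently dropped
print(list(zip_longest(name, email, fillvalue="")))  # C Corp kept, email left blank

In the scraper above, swapping zip(...) for zip_longest(..., fillvalue="") keeps every listing in the CSV, at the cost of blank cells where a field is missing.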

How to scrape wikipedia infobox and store it into a csv file

I have already scraped Wikipedia's infobox, but I don't know how to store that data in a CSV file. Please help me out.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

def infobox(query):
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        print(tr.text)

infobox('Infosys')
You have to collect the required data and write it to a CSV file; you can use the csv module, see the example below:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen  # Python 3: urlopen lives in urllib.request
import csv

def infobox(query):
    content_list = []
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        if len(tr.contents) > 1:
            content_list.append([tr.contents[0].text, tr.contents[1].text])
        elif tr.text:
            content_list.append([tr.text])
    write_csv_file(content_list)

def write_csv_file(content_list):
    # text mode with UTF-8 encoding; newline='' avoids blank lines on Windows
    with open(r'd:\Test.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerows(content_list)

infobox('Infosys')
Here is an outline of how you can test whether a row contains both a header and a table cell, to ensure two columns (you can extend the if structure to write td-only rows into the first column). I use slightly different encoding syntax for cleaner output, select for faster element selection than find, and pandas to generate the CSV.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

url = 'https://en.wikipedia.org/wiki/' + 'Infosys'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 'Referer': 'https://www.nseindia.com/'}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')

table = soup.select_one('.infobox.vcard')
rows = table.find_all('tr')
output = []

for row in rows:
    if len(row.select('th, td')) == 2:
        outputRow = [row.select_one('th').text, row.select_one('td').text,
                     [item['href'] for item in row.select('td a')] if row.select_one('td a') is not None else '']
        outputRow[2] = ['https://en.wikipedia.org/wiki/Infosys' + item if item[0] == '#' else 'https://en.wikipedia.org' + item for item in outputRow[2]]
        output.append(outputRow)

df = pd.DataFrame(output)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
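If you only need the plain text of the infobox (without the links), pandas.read_html can also parse it directly. A sketch, assuming the page is fetched with the same headers and that the infobox table carries exactly the 'infobox vcard' classes, as in the question's own find call:

import requests
import pandas as pd

url = 'https://en.wikipedia.org/wiki/Infosys'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'}
r = requests.get(url, headers=headers)

# read_html returns a list of DataFrames; attrs narrows it to the infobox table
infobox_df = pd.read_html(r.text, attrs={'class': 'infobox vcard'})[0]
infobox_df.to_csv('Infosys_infobox.csv', index=False, encoding='utf-8-sig')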
