I'm building a web scraper and am a bit stuck trying to manipulate the data I get out of bs4.
I'm trying to get the text of the ('div', class_='listing__content__wrapper') elements nicely organized under their 4 headers (headerList = ['streetName', 'city', 'province', 'postalCode']).
I got as far as getting it into a CSV file, but I can't get it into rows and columns.
Any help is appreciated.
Here is my code so far:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import csv

headers = {
    "User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

# we can ask for the url of the page you want to scrape here, remove after tests are successful.
# url = input("Enter url to scrape: ")
# for testing
url = 'https://www.yellowpages.ca/search/si/1/gym/Toronto+ON'

page = requests.get(url, headers=headers)

# tag and class of interest to parse
parse_only = SoupStrainer('div', class_='listing__content__wrapper')
soup = BeautifulSoup(page.content, 'html.parser', parse_only=parse_only)

streetaddress = soup.find_all('span', class_='jsMapBubbleAddress')

with open('test.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for line in streetaddress:
        writer.writerow(line.get_text())

# using a function works but still can't get all the data under the 4 columns/headerList
def append_to_csv(input_string):
    with open("test2.csv", "a") as csv_file:
        csv_file.write(input_string.get_text().strip() + "|")

for line in streetaddress:
    append_to_csv(line)

# for listing in streetaddress:
#     print((listing.get_text()), file=open('streetaddresses.csv', 'a'), sep='|')
I think this will do what you want.
fields = ['streetAddress', 'addressLocality', 'addressRegion', 'postalCode']
gather = {}
with open('test.csv', 'a') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fields)
    writer.writeheader()
    for line in streetaddress:
        gather[line.attrs["itemprop"]] = line.get_text()
        # postalCode is the last span of each listing, so write the completed row and start a new one
        if line.attrs["itemprop"] == "postalCode":
            writer.writerow(gather)
            gather = {}
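A minimal alternative sketch, assuming each listing yields exactly the four itemprop spans used above: since pandas is already imported, you could collect each listing into a dict keyed by its itemprop attribute and let pandas write the CSV.

fields = ['streetAddress', 'addressLocality', 'addressRegion', 'postalCode']
rows = []
gather = {}
for span in streetaddress:
    gather[span.attrs['itemprop']] = span.get_text(strip=True)
    if len(gather) == len(fields):   # one complete listing collected
        rows.append(gather)
        gather = {}

df = pd.DataFrame(rows, columns=fields)
df.to_csv('test.csv', index=False)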
Related
I am new to Python and am learning things slowly. I have previously performed API calls against databases to extract information. However, I am dealing with a particular Indian database, and the HTML seems confusing when I try to extract the particular information I am looking for. Basically, I have a list of herb name links as input which looks like this (only the ID changes):
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
When I open each of these, I want to extract the "Distribution" detail for the herb from the webpage. That's all. But in the HTML I can't figure out which header has that detail. I tried a lot before coming here. Can someone please help me? Thanks in advance.
Code:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path
from pprint import pprint

user_home = os.path.expanduser('~')
OUTPUT_DIR = os.path.join(user_home, 'vk_frlht')
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

herb_url = 'http://envis.frlht.org/bot_search'
response = requests.get(herb_url)
soup = BeautifulSoup(response.text, "html.parser")
token = soup.find('Type Botanical Name', {'type': 'hidden', 'name': 'token'})

herb_query_url = 'http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8'
response = requests.get('http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8')

# optional code for many links at once
with open(Path, 'r') as f:
    frlhtinput = f.readlines()
frlht = [x[:-1] for x in frlhtinput]
for line in frlht:
    out = requests.get(f'http://envis.frlht.org/plantdetails/{line}')
# end of the optional code

herb_query_soup = BeautifulSoup(response.text, "html.parser")
text = herb_query_soup.find('div', {'id': 'result-details'})
pprint(text)
This is how the page looks after scraping:
The loading sign in the middle means the content is only rendered after the JavaScript code executes; in other words, the content is loaded by JS rather than present in the initial HTML. You have to drive a browser with Selenium instead of parsing the raw response with BS4.
See the tutorial here on how to use it.
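For illustration, a minimal Selenium sketch (assuming Selenium 4+ with a Chrome driver available; the result-details id is taken from the question's own code):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8')
    # wait until the JS-rendered details block is present before reading it
    details = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'result-details'))
    )
    print(details.text)
finally:
    driver.quit()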
Try it.
import requests
from bs4 import BeautifulSoup
from pprint import pprint

plant_ids = ["3315", "2133", "845", "363"]
results = []

for plant_id in plant_ids:
    herb_query_url = f"http://envis.frlht.org/plantdetails/{plant_id}/fd01bd598f0869d65fe5a2861845f9f8"
    headers = {
        "Referer": herb_query_url,
    }
    # the plant details are actually served from this endpoint
    response = requests.get(
        f"http://envis.frlht.org/bot_search/plantdetails/plantid/{plant_id}/nocache/0.7763327765552295/referredfrom/extplantdetails",
        headers=headers,
    )
    herb_query_soup = BeautifulSoup(response.text, "html.parser")
    result = herb_query_soup.findAll("div", {"class": "initbriefdescription"})
    for r in result:
        result_dict = {r.text.split(":", 1)[0].strip(): r.text.split(":", 1)[1].strip()}
        results.append(result_dict)

pprint(results)
The information is obtained from another URL based on the URLs you have. First you need to construct the required URL (which can be found by watching the requests the browser makes) and request that.
This information can then be written to a CSV file as follows. It assumes you have a text file IDs.txt containing:
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
import requests
from bs4 import BeautifulSoup
import csv

fieldnames = ["ID", "Accepted Name", "Family", "Used in", "Distribution"]

with open('IDs.txt') as f_input, open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, extrasaction='ignore')
    csv_output.writeheader()
    for line in f_input:
        url = line.strip()  # Remove newline
        print(url)
        url_split = url.split('/')
        url_details = f"http://envis.frlht.org/bot_search/plantdetails/plantid/{url_split[4]}/nocache/{url_split[5]}/referredfrom/extplantdetails"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
            'Referer': url,
        }
        req = requests.get(url_details, headers=headers)
        soup = BeautifulSoup(req.content, "html.parser")
        row = {field: '' for field in fieldnames}  # default values
        row['ID'] = url_split[4]
        for div in soup.find_all('div', class_="initbriefdescription"):
            entry, value = div.get_text(strip=True).split(":", 1)
            row[entry] = value
        csv_output.writerow(row)
Giving an output starting:
ID,Accepted Name,Family,Used in,Distribution
3315,Amaranthus hybridusL. subsp.cruentusvar.paniculatusTHELL.,AMARANTHACEAE,"Ayurveda, Siddha, Folk","This species is globally distributed in Africa, Asia and India. It is said to be cultivated as a leafy vegetable in Maharashtra, Karnataka (Coorg) and on the Nilgiri hills of Tamil Nadu. It is also found as an escape."
2133,Triticum aestivumL.,POACEAE,"Ayurveda, Siddha, Unani, Folk, Chinese, Modern",
845,Dolichos biflorusL.,FABACEAE,"Ayurveda, Siddha, Unani, Folk, Sowa Rigpa","This species is native to India, globally distributed in the Paleotropics. Within India, it occurs all over up to an altitude of 1500 m. It is an important pulse crop particularly in Madras, Mysore, Bombay and Hyderabad."
363,Brassica oleraceaL.,BRASSICACEAE,"Ayurveda, Siddha",
I have a bunch of URLs in a CSV file and I have to extract data from those URLs into another CSV file. I extracted the data from those URLs into a dataframe using my code below, but when it comes to saving the extracted data to the output CSV, it only contains the last extracted record (i.e. if I have 10 URLs in demo.csv, only the extracted data of the 10th URL appears in the output CSV, not all of them).
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)

dict = {'url': [url], 'website': [website]}
df = pd.DataFrame(dict)
df.to_csv('export_dataframe.csv', index=False)
The problem seems to be the indentation of the line where you add your data to the dict. It sits outside the loop, and therefore only the last URL's data is added. I have pointed it out with a comment in the code below.
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []

with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)
        data.append([url, website])  # this line is outside the loop in your code; I am also using a list here just to simplify (you could still use a dict)

df = pd.DataFrame(data, columns=['url', 'website'])
df.to_csv('export_dataframe.csv', index=False)
from bs4 import BeautifulSoup
import requests
import csv

class Parse():
    def __init__(self):
        self.row_list = []
        self.base_url = 'https://www.tripadvisor.co.uk'

    def parse(self, url):  # correct
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
        response = requests.get(url, headers).text
        soup = BeautifulSoup(response, 'html.parser')
        next_link = soup.find('a', class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
        next_page = self.base_url + next_link.attrs['href']
        cards = soup.find_all('section', class_='_2TabEHya _3YhIe-Un')
        for card in cards:
            name = card.find('div', class_='_1gpq3zsA _1zP41Z7X').text
            rating = str(card.find('svg', class_='zWXXYhVR'))
            rating = self.remove(filter_col=rating)
            review_count = card.find('span', class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
            status = card.find('div', class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
            row_list = [name, rating, status, review_count]
        return next_page, row_list

    def remove(self, filter_col):
        rating = filter_col.split(' ')[1]
        rating = rating[-3:]
        return rating

    def write_csv(self, row_list):
        with open('top_sites.csv', 'w') as file:
            csv_writer = csv.writer(file, delimiter=',')
            csv_writer.writerows(row_list)

if __name__ == '__main__':
    url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
    parsing = Parse()
    next_url, row_list = parsing.parse(url=url)
    print(next_url)
PS C:\Users\Caspe\PycharmProjects\Selenium Test> & "c:/Users/Caspe/PycharmProjects/Selenium Test/.venv/Scripts/python.exe" "c:/Users/Caspe/PycharmProjects/Selenium Test/Demo/tripadvisor_topattract.py"
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
PS C:\Users\Caspe\PycharmProjects\Selenium Test>
I'm trying to scrape data from TripAdvisor Website using BeautifulSoup.
Link: https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html
Instead of pointing to the next page, the same link is repeated. Is there a solution to my problem?
I've selected the correct selector for the soup and I was able to scrape data.
To get pagination working, it's necessary to change the -oa<index>- part in URL:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"

data = []
for page in range(0, 4):  # <--- increase page count here
    print("Getting page {}..".format(page))
    soup = BeautifulSoup(
        requests.get(url.format(page * 30)).content, "html.parser"
    )
    titles = soup.select('span[name="title"]')
    for title in titles:
        no, t = title.get_text(strip=True, separator="|").split("|")
        rating = title.find_next("svg")
        review_count = rating.find_next("span")
        data.append(
            (
                no,
                t,
                rating["title"],
                review_count.text,
                review_count.find_next(
                    "div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
                ).text,
            )
        )

with open("data.csv", "w") as f_out:
    w = csv.writer(f_out)
    w.writerows(data)
Writes data.csv (screenshot from LibreOffice):
I just tried to run the code below. I got no error message, but no data was actually written to the CSV. I looked at the website and found both snapshot-td2-cp and snapshot-td2 elements. When I remove the writer.writerow statements and use print statements instead, I see six number 2 characters, and that's it.
import csv
import requests
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX', 'MSFT', 'AAPL']
url_list = [url_base + s for s in tckr]

with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            fsoup = BeautifulSoup(fpage.content, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except:
            print("{} - not found".format(url))
In the SBUX example, I want to get data from this table.
I tested this code a few months ago, and everything worked fine. Can someone point out my mistake? I'm not seeing it. Thanks.
To get the data, specify User-Agent in your requests.
import csv
import requests
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX', 'MSFT', 'AAPL']
url_list = [(s, url_base + s) for s in tckr]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for t, url in url_list:
        print('Scrapping ticker {}...'.format(t))
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
        writer.writerow([t])
        for row in soup.select('.snapshot-table2 tr'):
            writer.writerow([td.text for td in row.select('td')])
Prints:
Scrapping ticker SBUX...
Scrapping ticker MSFT...
Scrapping ticker AAPL...
and saves data.csv (Screenshot from LibreOffice):
I have already done the scraping of Wikipedia's infobox, but I don't know how to store that data in a CSV file. Please help me out.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

def infobox(query):
    query = query
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        print(tr.text)

infobox('Infosys')
You have to collect the required data and write it to a CSV file; you can use the csv module, see the example below:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import csv

def infobox(query):
    content_list = []
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        if len(tr.contents) > 1:
            # two-cell row: keep header and value as separate columns
            content_list.append([tr.contents[0].text, tr.contents[1].text])
        elif tr.text:
            # single-cell row: keep its text in one column
            content_list.append([tr.text])
    write_csv_file(content_list)

def write_csv_file(content_list):
    # newline='' and utf-8 encoding so the csv module writes rows correctly
    with open(r'd:\Test.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerows(content_list)

infobox('Infosys')
Here is an outline of how you can test whether a row has both a header and a table cell element within it, to ensure two columns (you can expand this to also write th-only rows, perhaps to populate the first column, within the if structure; see the sketch after the code). I use slightly different encoding syntax for cleaner output, use select for faster element selection than find, and utilize pandas to generate the CSV.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

url = 'https://en.wikipedia.org/wiki/' + 'Infosys'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 'Referer': 'https://www.nseindia.com/'}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')
table = soup.select_one('.infobox.vcard')
rows = table.find_all('tr')
output = []

for row in rows:
    if len(row.select('th, td')) == 2:
        outputRow = [row.select_one('th').text, row.select_one('td').text, [item['href'] for item in row.select('td a')] if row.select_one('td a') is not None else '']
        outputRow[2] = ['https://en.wikipedia.org/wiki/Infosys' + item if item[0] == '#' else 'https://en.wikipedia.org' + item for item in outputRow[2]]
        output.append(outputRow)

df = pd.DataFrame(output)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
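A sketch of the expansion mentioned above, as an alternative version of the loop: it assumes single-cell rows (section titles, the logo cell) should also be kept, with the other column left empty.

output = []
for row in rows:
    th, td = row.select_one('th'), row.select_one('td')
    if th is not None and td is not None:
        # two-column row: header, value, plus any links in the value cell
        links = [item['href'] for item in row.select('td a')]
        output.append([th.text, td.text, links])
    elif th is not None:
        # header-only row, e.g. a section title: keep it in the first column
        output.append([th.text, '', ''])
    elif td is not None:
        # cell-only row, e.g. the logo/caption cell: keep it in the second column
        output.append(['', td.text, ''])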