I want to Extract Company Name, Person, Country, Phone and Email to an excel file. I tried the following code but it returns only one value in the excel file. How to loop this around the first page and next pages too..
import csv
import re
import requests
import urllib.request
from bs4 import BeautifulSoup
for page in range(10):
url = "http://www.aepcindia.com/buyersdirectory"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
tbody = soup('div', {'class':'view-content'})#[0].find_all('')
f = open('filename.csv', 'w', newline = '')
Headers = "Name,Person,Country,Email,Phone\n"
csv_writer = csv.writer(f)
f.write(Headers)
for i in tbody:
try:
name = i.find("div", {"class":"company_name"}).get_text()
person = i.find("div", {"class":"title"}).get_text()
country = i.find("div", {"class":"views-field views-field-field-country"}).get_text()
email = i.find("div", {"class":"email"}).get_text()
phone = i.find("div", {"class":"telephone_no"}).get_text()
print(name, person, country, email, phone)
f.write("{}".format(name).replace(","," ")+ ",{}".format(person)+ ",{}".format(country)+ ",{}".format(email) + ",{}".format(phone) + "\n")
except: AttributeError
f.close()
Here is the link of the web page
http://www.aepcindia.com/buyersdirectory
import requests
from bs4 import BeautifulSoup
import csv
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}
def main(url):
with requests.Session() as req:
with open("data.csv", 'w', newline="") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Title", "Country", "Email", "Phone"])
for item in range(0, 10):
print(f"Extracting Page# {item +1}")
r = req.get(url.format(item), headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
name = [name.text for name in soup.select("div.company_name")]
title = [title.text for title in soup.select("div.title")]
country = [country.text for country in soup.findAll(
"div", class_="field-content", text=True)]
email = [email.a.text for email in soup.select(
"div.email")]
phone = [phone.text
for phone in soup.select("div.telephone_no")]
data = zip(name, title, country, email, phone)
writer.writerows(data)
main("http://www.aepcindia.com/buyersdirectory?page={}")
Output: view-online
Related
I am new to python and am learning things slowly. I have earlier performed API calls from databases to extract infromation. However, I was dealing with a particular Indian database. The html script seems confusing to extract the particular infromation I am looking for. Basically, I have a list of herb name links as input which looks like this(only the ID changes):
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
When I open each of this, I want to extract the "Distribution" detail for these herb links from the webpage. That's all. But, in the html script, I cant figure which header has the detail. I tried a lot before coming here. Can someone please help me. Thanks in advance.
Code:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path
from pprint import pprint
user_home = os.path.expanduser('~')
OUTPUT_DIR = os.path.join(user_home, 'vk_frlht')
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
herb_url = 'http://envis.frlht.org/bot_search'
response = requests.get(herb_url)
soup = BeautifulSoup(response.text, "html.parser")
token = soup.find('Type Botanical Name', {'type': 'hidden', 'name': 'token'})
herb_query_url = 'http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8'
response = requests.get('http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8')
#optional code for many links at once
with open(Path, 'r') as f:
frlhtinput = f.readlines()
frlht = [x[:-1] for x in frlhtinput]
for line in frlht:
out = requests.get(f'http://envis.frlht.org/plantdetails/{line}')
#end of the optional code
herb_query_soup = BeautifulSoup(response.text, "html.parser")
text = herb_query_soup.find('div', {'id': 'result-details'})
pprint(text)
This is how this page looks after scrapping:
Loading sign in the middle means that content can be loaded only after JavaScript code executes. Meaning someone protected this content with JS code. You have to use Selenium browser instead of BS4.
See tutorial here on how to use it.
Try it.
import requests
from bs4 import BeautifulSoup
from pprint import pprint
plant_ids = ["3315", "2133", "845", "363"]
results = []
for plant_id in plant_ids:
herb_query_url = f"http://envis.frlht.org/plantdetails/{plant_id}/fd01bd598f0869d65fe5a2861845f9f8"
headers = {
"Referer": herb_query_url,
}
response = requests.get(
f"http://envis.frlht.org/bot_search/plantdetails/plantid/{plant_id}/nocache/0.7763327765552295/referredfrom/extplantdetails",
headers=headers,
)
herb_query_soup = BeautifulSoup(response.text, "html.parser")
result = herb_query_soup.findAll("div", {"class": "initbriefdescription"})
for r in result:
result_dict = {r.text.split(":", 1)[0].strip(): r.text.split(":", 1)[1].strip()}
results.append(result_dict)
pprint(results)
enter code here
import requests
from bs4 import BeautifulSoup
import csv
fieldnames = ["ID", "Accepted Name", "Family", "Used in", "Distribution"]
with open('IDs.txt') as f_input, open('output.csv', 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, extrasaction='ignore')
csv_output.writeheader()
for line in f_input:
url = line.strip() # Remove newline
print(url)
url_split = url.split('/')
url_details = f"http://envis.frlht.org/bot_search/plantdetails/plantid/{url_split[4]}/nocache/{url_split[5]}/referredfrom/extplantdetails"
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'Referer' : url,
}
req = requests.get(url_details, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
row = {field : '' for field in fieldnames} # default values
row['ID'] = url_split[4]
result = soup.findAll("div", {"class": "initbriefdescription"})
for r in result:
result_dict = r.get_text(strip=True).split(":" ,1)
results.append(result_dict)
row[entry] = results
print(row)
with open('output.csv', 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, extrasaction='ignore')
csv_output.writeheader()
csv_output.writerow(row)
The information is obtained from another URL based on the URLs you have. First you need to construct the required URL (which was found looking at the browser) and requesting that.
This information could be written to a CSV file as follows. It assumes you have a text file IDs.txt as follows:
http://envis.frlht.org/plantdetails/3315/fd01bd598f0869d65fe5a2861845f9f8
http://envis.frlht.org/plantdetails/2133/fd01bd598f0869d65fe5a2861845f9f9
http://envis.frlht.org/plantdetails/845/fd01bd598f0869d65fe5a2861845f9f10
http://envis.frlht.org/plantdetails/363/fd01bd598f0869d65fe5a2861845f9f11
import requests
from bs4 import BeautifulSoup
import csv
fieldnames = ["ID", "Accepted Name", "Family", "Used in", "Distribution"]
with open('IDs.txt') as f_input, open('output.csv', 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, extrasaction='ignore')
csv_output.writeheader()
for line in f_input:
url = line.strip() # Remove newline
print(url)
url_split = url.split('/')
url_details = f"http://envis.frlht.org/bot_search/plantdetails/plantid/{url_split[4]}/nocache/{url_split[5]}/referredfrom/extplantdetails"
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'Referer' : url,
}
req = requests.get(url_details, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
row = {field : '' for field in fieldnames} # default values
row['ID'] = url_split[4]
for div in soup.find_all('div', class_="initbriefdescription"):
entry, value = div.get_text(strip=True).split(":" ,1)
row[entry] = value
csv_output.writerow(row)
Giving an output starting:
ID,Accepted Name,Family,Used in,Distribution
3315,Amaranthus hybridusL. subsp.cruentusvar.paniculatusTHELL.,AMARANTHACEAE,"Ayurveda, Siddha, Folk","This species is globally distributed in Africa, Asia and India. It is said to be cultivated as a leafy vegetable in Maharashtra, Karnataka (Coorg) and on the Nilgiri hills of Tamil Nadu. It is also found as an escape."
2133,Triticum aestivumL.,POACEAE,"Ayurveda, Siddha, Unani, Folk, Chinese, Modern",
845,Dolichos biflorusL.,FABACEAE,"Ayurveda, Siddha, Unani, Folk, Sowa Rigpa","This species is native to India, globally distributed in the Paleotropics. Within India, it occurs all over up to an altitude of 1500 m. It is an important pulse crop particularly in Madras, Mysore, Bombay and Hyderabad."
363,Brassica oleraceaL.,BRASSICACEAE,"Ayurveda, Siddha",
from bs4 import BeautifulSoup
import requests
import csv
class Parse():
def __init__(self):
self.row_list = []
self.base_url ='https://www.tripadvisor.co.uk'
def parse(self,url): # correct
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
response = requests.get(url,headers).text
soup = BeautifulSoup(response,'html.parser')
next_link = soup.find('a',class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
next_page = self.base_url+next_link.attrs['href']
cards = soup.find_all('section',class_='_2TabEHya _3YhIe-Un')
for card in cards:
name = card.find('div',class_='_1gpq3zsA _1zP41Z7X').text
rating = str(card.find('svg',class_='zWXXYhVR'))
rating = self.remove(filter_col=rating)
review_count = card.find('span',class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
status = card.find('div',class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
row_list = [name,rating,status,review_count]
return next_page,row_list
def remove(self,filter_col):
rating = filter_col.split(' ')[1]
rating = rating[-3:]
return rating
def write_csv(self,row_list):
with open('top_sites.csv','w') as file:
csv_writer = csv.writer(file, delimiter=',')
csv_writer.writerows(row_list)
if __name__=='__main__':
url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
parsing = Parse()
next_url,row_list = parsing.parse(url=url)
print(next_url)
PS C:\Users\Caspe\PycharmProjects\Selenium Test> & "c:/Users/Caspe/PycharmProjects/Selenium Test/.venv/Scripts/python.exe" "c:/Users/Caspe/PycharmProjects/Selenium Test/Demo/tripadvisor_topattract.py"
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
PS C:\Users\Caspe\PycharmProjects\Selenium Test>
I'm trying to scrape data from TripAdvisor Website using BeautifulSoup.
Link: https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html
Instead of going to next page, the link is repeated itself. Is there a solution for my problem?
I've selected the correct selector for the soup and I was able to scrape data.
To get pagination working, it's necessary to change the -oa<index>- part in URL:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"
data = []
for page in range(0, 4): # <--- increase page count here
print("Getting page {}..".format(page))
soup = BeautifulSoup(
requests.get(url.format(page * 30)).content, "html.parser"
)
titles = soup.select('span[name="title"]')
for title in titles:
no, t = title.get_text(strip=True, separator="|").split("|")
rating = title.find_next("svg")
review_count = rating.find_next("span")
data.append(
(
no,
t,
rating["title"],
review_count.text,
review_count.find_next(
"div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
).text,
)
)
with open("data.csv", "w") as f_out:
w = csv.writer(f_out)
w.writerows(data)
Writes data.csv (screenshot from LibreOffice):
i m building a webscraper and a bit stuck trying to manipulate the data i get out of bs4.
i m trying to get the text of the ('div', class_='listing__content__wrapper') nice organized into their 4 headers (headerList = ['streetName', 'city', 'province', 'postalCode'])
i got as far as getting it into a csv file but I can't get it into rows and columns.
All the help I can get is appreciated.
here is my code so far:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import csv
headers = {
"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
# we can ask for the url of the page you want to scrape here, remove after tests are successful.
# url = input("Enter url to scrape: ")
# for testing
url = 'https://www.yellowpages.ca/search/si/1/gym/Toronto+ON'
page = requests.get(url, headers=headers)
# tag and class of interest to parse
parse_only = SoupStrainer(
'div', class_='listing__content__wrapper')
soup = BeautifulSoup(page.content, 'html.parser', parse_only=parse_only)
streetaddress = (soup.find_all('span', class_='jsMapBubbleAddress'))
with open('test.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
for line in streetaddress:
writer.writerow(line.get_text())
# using a function works but still can't get all the data under the 4 columns/headerList
def append_to_csv(input_string):
with open("test2.csv", "a") as csv_file:
csv_file.write(input_string.get_text().strip() +
"|")
for line in streetaddress:
append_to_csv(line)
# for listing in streetaddress:
# print((listing.get_text()), file=open('streetaddresses.csv', 'a'), sep='|')
I think this will do what you want.
fields = ['streetAddress','addressLocality','addressRegion','postalCode']
gather = {}
with open('test.csv', 'a') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fields)
writer.writeheader()
for line in streetaddress:
gather[line.attrs["itemprop"]] = line.get_text()
if line.attrs["itemprop"] == "postalCode":
writer.writerow(gather)
gather = {}
I just tried to run the code below. I got no error message, but no data was actually written to the CSV. I looked at the website and I found both snapshot-td2-cp and snapshot-td2 elements. When I remove the writer.writerow statements and use print statements, I see six number 2 characters, and that's it.
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as f:
writer = csv.writer(f)
for url in url_list:
try:
fpage = requests.get(url)
fsoup = BeautifulSoup(fpage.content, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except:
print("{} - not found".format(url))
In the SBUX example, I want to get data from this table.
I tested this code a few months ago, and everything worked fine. Can someone point out my mistake? I'm not seeing it. Thanks.
To get the data, specify User-Agent in your requests.
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [(s, url_base + s) for s in tckr]
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
with open('data.csv', 'w') as f_out:
writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for t, url in url_list:
print('Scrapping ticker {}...'.format(t))
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
writer.writerow([t])
for row in soup.select('.snapshot-table2 tr'):
writer.writerow([td.text for td in row.select('td')])
Prints:
Scrapping ticker SBUX...
Scrapping ticker MSFT...
Scrapping ticker AAPL...
and saves data.csv (Screenshot from LibreOffice):
I'm trying to scrape through the table at https://bgp.he.net/report/world. I would like to go through each of the HTML links going to country pages, then grab the data and then iterate to the next list. I'm using beautiful soup and can already grab the data the I want, but can't quite figure out how to iterate through the column of HTMLs.
from bs4 import BeautifulSoup
import requests
import json
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/country/LC"
html = requests.get(url, headers=headers)
country_ID = (url[-2:])
print("\n")
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)
data = []
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
i = 0
with open ('table_attempt.txt', 'w') as r:
for item in data:
r.write(str(data[i]))
i += 1
r.write("\n")
print(data)
I would like to be able to gather the data from each country into one written text file.
I only tested this with the first 3 links (got one error with UnicodeEncodeError but fixed that and commented where that was in the code).
from bs4 import BeautifulSoup
import requests
import json
#First get the list of countries urls
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []
# Go through each row and grab the link. If there's no link, continue to next row
for row in rows:
try:
link = row.select('a')[0]['href']
country_urls.append(link)
except:
continue
# Now iterate through that list
for link in country_urls:
url = "https://bgp.he.net" + link
html = requests.get(url, headers=headers)
country_ID = (url[-2:])
print("\n")
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)
data = []
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
i = 0
print ('Writing from %s' %(url))
# I added encoding="utf-8" because of an UnicodeEncodeError:
with open ('table_attempt.txt', 'w', encoding="utf-8") as r:
for item in data:
r.write(str(data[i]))
i += 1
r.write("\n")
You can iterate over the main table, and send a request to scrape the "report" listing:
import requests, re
from bs4 import BeautifulSoup as soup
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
def scrape_report(_id):
_d = soup(requests.get(f'https://bgp.he.net/country/{_id}', headers=headers).text, 'html.parser')
_headers = [i.text for i in _d.find_all('th')]
_, *data = [[i.text for i in b.find_all('td')] for b in _d.find_all('tr')]
return [dict(zip(_headers, i)) for i in data]
d = soup(requests.get('https://bgp.he.net/report/world', headers=headers).text, 'html.parser')
_, *_listings = [[re.sub('[\t\n]+', '', i.text) for i in b.find_all('td')] for b in d.find_all('tr')]
final_result = [{**dict(zip(['Name', 'Country', 'ASN'], [a, b, c])), 'data':scrape_report(b)} for a, b, c, *_ in _listings]
import requests
import json
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
#sorting through table
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []
#Grabbing urls from table
for row in rows:
try:
link = row.select('a')[0]['href']
country_urls.append(link)
except:
continue
Total_URLs= len(country_urls)
print(Total_URLs, "counties to pull data from")
print("\n")
#Creating text file
with open('table_attempt.txt', 'w', encoding="utf-8") as r:
json.dumps([])
#Looping through country url list
for link in country_urls:
url = "https://bgp.he.net" + link
html = requests.get(url, headers=headers)
#Taking country identifier from url list
country_ID = (url[-2:])
soup = BeautifulSoup(html.text, 'html.parser')
data = []
i=0
Total_URLs -= 1
#appending to file
with open('ASN_Info.txt', 'a', encoding="utf-8") as r:
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
json.dump(data[i], r)
i += 1
r.write("\n")
print('Currently writing from data from %s. %s countries left to pull data from.' %(country_ID, Total_URLs))