Is it possible to scrape an attribute inside a span? - python

So I want to scrape some phone numbers from a site. The only problem is that they are hidden behind a click, and I can't go and click all of them to make them scrape-able, so I wanted to ask if there is any way to get them from the 'data-phone' attribute inside the span tag.
I tried to use data_='data-phone', but that doesn't work.
from bs4 import BeautifulSoup
import requests
import csv

source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title', 'location'])

for number in soup.find_all('span', data_='data-phone'):
    print(number)

for info in soup.find_all('div', class_='company-info-top'):
    title = info.a.text
    location = info.p.text
    csv_writer.writerow([title, location])

csv_file.close()

Change

for number in soup.find_all('span', data_='data-phone'):
    print(number)

to

for number in soup.find_all('span', class_='phone'):
    print(number['data-phone'])
Output:
0334226800
0878739737
0334558584
0334798200
0334720311
0334677050
0334554948
0334535384
0337767840
0334560292
0626214363
0334559065
0334506506
0620423525
0334556166
0332012581
0334557485
0334946111
0334536200
0334545111
0334545430
0337851805
033-4721544
06-26662490
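As an aside, the reason data_='data-phone' fails is that hyphenated attributes like data-phone can't be passed as keyword arguments to find_all(): they aren't valid Python identifiers. You can instead match on the presence of the attribute itself via attrs. A minimal sketch of that alternative, using the same page:

from bs4 import BeautifulSoup
import requests

source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')

# attrs={'data-phone': True} matches any span that has a data-phone
# attribute at all, regardless of the tag's class
for number in soup.find_all('span', attrs={'data-phone': True}):
    print(number['data-phone'])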
To incorporate that into your csv:
from bs4 import BeautifulSoup
import requests
import csv

with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['naambedrijf', 'address', 'phone'])

    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            address = search.p.text.strip()
            try:
                phone = search.find('span', {'class': 'phone'})['data-phone']
            except TypeError:
                # find() returned None: this listing has no phone span
                phone = 'N/A'
            print(title)
            csv_writer.writerow([title, address, phone])

Related

Scraping only returns header not details

I'm a newbie to Python and am just teaching myself how to code and scrape data. Hoping someone can explain what I'm doing wrong, or why the following script only gets the headers and no data is inserted into the text file.
Is it because the scrape is returning None or empty data fields, or am I missing something in my logic?
I'm not getting any errors from the code, it would seem.
code
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import csv
import datetime

# get the current date
date = datetime.datetime.now().strftime("%Y%m%d")

# create the output file
filename = 'C:/Users/AJS2/Documents/datafiles_test/' + date + '.txt'
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['url', 'name', 'address', 'phone', 'email'])

# define the base url
base_url = "http://www.wedding-planners.com/index.cfm"

# get the page
page = requests.get(base_url)

# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')

# get the next button
next_button = soup.find('a', class_='Next')

# set the counter
counter = 0

# loop through the pages
while next_button and counter < 50:
    # get the list of wedding planners
    wedding_planners = soup.find_all('div', class_='plannerName')

    # loop through the list of wedding planners
    for planner in wedding_planners:
        # get the url
        url = planner.a['href']

        # get the page
        page = requests.get(url)

        # create a beautifulsoup object
        soup = BeautifulSoup(page.content, 'html.parser')

        # get the name
        name = soup.find('h1', class_='head1').text

        # get the address
        address = soup.find('span', class_='address').text

        # get the phone
        phone = soup.find('span', class_='phone').text

        # get the email
        email = soup.find('span', class_='email').text

        # save the data
        with open(filename, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([url, name, address, phone, email])

    # increment the counter
    counter += 1

    # get the next page
    page = requests.get(next_button['href'])

    # create a beautifulsoup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # get the next button
    next_button = soup.find('a', class_='Next')

print('Finished scraping')
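Note that no errors are raised here even if nothing matches, because find_all() simply returns an empty list, in which case the inner loop never runs and only the header row is ever written. A quick hedged debugging sketch (same URL and class names as the script above, which may or may not match the live markup) to check what the selectors actually find:

import requests
from bs4 import BeautifulSoup

# fetch the first page and report how many elements each selector matches;
# zero matches would explain an output file containing only the header row
page = requests.get("http://www.wedding-planners.com/index.cfm")
soup = BeautifulSoup(page.content, 'html.parser')

print('plannerName divs:', len(soup.find_all('div', class_='plannerName')))
print('Next link found:', soup.find('a', class_='Next') is not None)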

After scraping I cannot write the text to a text file

I am trying to scrape the prices from a website and it's working, but I can't write the result to a text file.
This is my Python code:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")
price = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
name = "example"
f = open(name + '.txt', "a")
f.write(price.text)
This is not working, but if I print the result instead of writing it to a text file, it works. I have searched for a long time but don't understand it. I think it must be a string to write it to a text file, but I don't know how to convert the output to a string.
You're getting the error due to a Unicode character.
Try adding encoding='utf-8' when opening the file.
Also, your code gives a bit of a messy output. Try this instead:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")

rows = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
prices = rows.findAll("span", {"class": "price-holder-row"})
names = rows.findAll("div", {"class": "name-holder"})

price_list = []
name_list = []
for price in prices:
    price_list.append(price.text.strip("\n "))
for name in names:
    name_list.append(name.text.split()[0])

name = "example"
with open(f"{name}.txt", mode='w', encoding='utf-8') as f:
    for name, price in zip(name_list, price_list):
        f.write(f"{name}:{price}\n")

Python Web Scraping - How to scrape this type of site?

Okay, so I need to scrape the following webpage: https://www.programmableweb.com/category/all/apis?deadpool=1
It's a list of APIs. There are approx 22,000 APIs to scrape.
I need to:
1) Get the URL of each API in the table (pages 1-889), and also scrape the following info:
API name
Description
Category
Submitted
2) I then need to scrape a bunch of information from each URL.
3) Export the data to a CSV
The thing is, I'm a bit lost on how to think about this project. From what I can see, there are no AJAX calls being made to populate the table, which means I'm going to have to parse the HTML directly (right?)
In my head, the logic would be something like this:
Use the requests & BS4 libraries to scrape the table
Then, somehow grab the HREF from every row
Access that HREF, scrape the data, move onto the next one
Rinse and repeat for all table rows.
Am I on the right track? Is this possible with requests & BS4?
Thank you SOO much for any help. This is hurting my head haha
Here we go using requests, BeautifulSoup and pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page='
num = int(input('How Many Pages to Parse?> '))
print('please wait....')

name = []
desc = []
cat = []
sub = []

for i in range(0, num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for item1 in soup.findAll('td', attrs={'class': 'views-field views-field-title col-md-3'}):
        name.append(item1.text)
    for item2 in soup.findAll('td', attrs={'class': 'views-field views-field-search-api-excerpt views-field-field-api-description hidden-xs visible-md visible-sm col-md-8'}):
        desc.append(item2.text)
    for item3 in soup.findAll('td', attrs={'class': 'views-field views-field-field-article-primary-category'}):
        cat.append(item3.text)
    for item4 in soup.findAll('td', attrs={'class': 'views-field views-field-created'}):
        sub.append(item4.text)

result = []
for item in zip(name, desc, cat, sub):
    result.append(item)

df = pd.DataFrame(result, columns=['API Name', 'Description', 'Category', 'Submitted'])
df.to_csv('output.csv')
print('Task Completed, Result saved to output.csv file.')
Now, for href parsing:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.programmableweb.com/category/all/apis?deadpool=0&page='
num = int(input('How Many Pages to Parse?> '))
print('please wait....')

links = []
for i in range(0, num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for link in soup.findAll('td', attrs={'class': 'views-field views-field-title col-md-3'}):
        for href in link.findAll('a'):
            result = 'https://www.programmableweb.com' + href.get('href')
            links.append(result)

spans = []
for link in links:
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    span = [span.text for span in soup.select('div.field span')]
    spans.append(span)

data = []
for item in spans:
    data.append(item)

df = pd.DataFrame(data)
df.to_csv('data.csv')
print('Task Completed, Result saved to data.csv file.')
In case you want those two CSV files merged together, here's the code:
import pandas as pd
a = pd.read_csv("output.csv")
b = pd.read_csv("data.csv")
merged = a.merge(b)
merged.to_csv("final.csv", index=False)
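A note on a.merge(b): called with no arguments, merge() joins on whatever columns the two frames share, which here is the unnamed index column that both to_csv() calls above wrote out. A sketch that makes that join key explicit (assuming the two files produced above):

import pandas as pd

# both CSVs were written with the default integer index, so joining on
# that index keeps row i of output.csv aligned with row i of data.csv
a = pd.read_csv("output.csv", index_col=0)
b = pd.read_csv("data.csv", index_col=0)
merged = a.merge(b, left_index=True, right_index=True)
merged.to_csv("final.csv", index=False)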
You should read more about scraping if you are going to pursue it.
from bs4 import BeautifulSoup
import csv, os, requests
from urllib import parse

def SaveAsCsv(list_of_rows):
    try:
        with open('data.csv', mode='a', newline='', encoding='utf-8') as outfile:
            csv.writer(outfile).writerow(list_of_rows)
    except PermissionError:
        print("Please make sure data.csv is closed\n")

if os.path.isfile('data.csv') and os.access('data.csv', os.R_OK):
    print("File data.csv Already exists \n")
else:
    SaveAsCsv(['api_name', 'api_link', 'api_desc', 'api_cat'])

BaseUrl = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page={}'
for i in range(1, 890):
    print('## Getting Page {} out of 889'.format(i))
    url = BaseUrl.format(i)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    table_rows = soup.select('div.view-content > table[class="views-table cols-4 table"] > tbody tr')
    for row in table_rows:
        tds = row.select('td')
        api_name = tds[0].text.strip()
        api_link = parse.urljoin(url, tds[0].find('a').get('href'))
        api_desc = tds[1].text.strip()
        api_cat = tds[2].text.strip() if len(tds) >= 3 else ''
        SaveAsCsv([api_name, api_link, api_desc, api_cat])
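One practical note on the loop above: it fires up to 889 requests back-to-back. A hedged courtesy tweak (the one-second delay is an arbitrary choice, not anything the site specifies) is to pause between page fetches:

import time

# at the end of each iteration of the page loop:
time.sleep(1)  # brief pause between fetches to avoid hammering the server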

bs4 python extracting value from <span></span> to .csv printing the same result over and over

I have managed to build a very primitive program to scrape vehicle data from PistonHeads and print it to a .csv file with the link, make and model, and I am now working on getting the price, which is where I am encountering a problem.
I want to scrape the prices into the fourth column in my .csv file (Price) and to correctly print the price of each vehicle on the website.
At the moment I am only getting the price of one vehicle, repeated again and again next to each vehicle in the .csv file.
I have tried soup.findAll and soup.find_all to see whether parsing through multiple elements would work, but this is just creating a bigger mess.
Might someone be able to help?
I am also trying to scrape the image src and would like to print that in another (fifth) column called Images.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = "https://www.pistonheads.com" + a['href']
        make = a['href'].split('/')[-4]
        model = a['href'].split('/')[-3]
        price = soup.find('span')
        writer.writerow([link, make, model, price])
        print(link, make, model, price)

outfile.close()
You can try this:
import csv, requests, re
from bs4 import BeautifulSoup as soup

d = soup(requests.get('https://www.pistonheads.com/classifieds?Category=used-cars&ResultsPerPage=100').text, 'html.parser')

def extract_details(_s: soup) -> list:
    _link = _s.find('a', {'href': re.compile(r'/classifieds/used\-cars/')})['href']
    _, _, make, model, *_ = _link[1:].split('/')
    price, img = _s.find('div', {'class': 'price'}).text, [i['src'] for i in _s.find_all('img')]
    return [_link, make, model, price, 'N/A' if not img else img[0]]

with open('filename.csv', 'w') as f:
    _listings = [extract_details(i) for i in d.find_all('div', {'class': 'ad-listing'}) if i.find('div', {'class': 'price'})]
    write = csv.writer(f)
    write.writerows([['make', 'model', 'price', 'img'], *_listings])
The reason is this line: price = soup.find('span').
.find() grabs the first element it finds, and you have it looking in your soup object, i.e. the whole page. Where you want it to look is within your a, because that's what you are looping through with for a in links:.
I also added .text, as I'm assuming you just want the text, not the whole tag element, i.e. price = a.find('span').text.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price", "Images"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = "https://www.pistonheads.com" + a['href']
        make = a['href'].split('/')[-4]
        model = a['href'].split('/')[-3]
        price = a.find('span').text
        image_link = a.parent.parent.find('img')['src']
        image = link + image_link
        writer.writerow([link, make, model, price, image])
        print(link, make, model, price, image)

outfile.close()
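One hedged caveat about image = link + image_link above: if the img src ever comes back as an absolute or protocol-relative URL rather than a site-relative path, plain string concatenation produces a broken link. urllib.parse.urljoin handles all three cases, so a safer variant of those two lines might be:

from urllib.parse import urljoin

# urljoin keeps absolute srcs intact and resolves relative ones
# against the site root instead of blindly concatenating strings
image_link = a.parent.parent.find('img')['src']
image = urljoin('https://www.pistonheads.com', image_link)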

How can I scrape multiple pages using Beautiful Soup?

How can I scrape multiple pages from a website? This code is only working for the first one:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    r = requests.get("https://azetshop.ro/12-extensa?page=1")
    soup = BeautifulSoup(r.text, "html.parser")
    x = soup.find_all("div", "thumbnail")
    for thumbnail in x:
        descriere = thumbnail.find("h3").text.strip()
        pret = thumbnail.find("price").text.strip()
        writer.writerow([descriere, pret, datetime.datetime.now()])
For multiple-page scraping with BeautifulSoup, many people do it using a while loop:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

end_page_num = 50

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])

    i = 1
    while i <= end_page_num:
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(i))
        soup = BeautifulSoup(r.text, "html5lib")
        x = soup.find_all("div", {'class': 'thumbnail-container'})
        for thumbnail in x:
            descriere = thumbnail.find('h1', {"class": "h3 product-title"}).text.strip()
            pret = thumbnail.find('span', {"class": "price"}).text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
        i += 1
Here i increments by 1 each time the scraping of a page is completed.
Scraping will continue until the end_page_num you have defined.
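If the total page count isn't known up front, a hedged variant of the same loop stops as soon as a page comes back with no products (this assumes the site simply returns no thumbnail-container divs past the last page):

import csv
import requests
from bs4 import BeautifulSoup
import datetime

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    i = 1
    while True:
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(i))
        soup = BeautifulSoup(r.text, "html5lib")
        x = soup.find_all("div", {'class': 'thumbnail-container'})
        if not x:
            # no products on this page: we have run past the last page
            break
        for thumbnail in x:
            descriere = thumbnail.find('h1', {"class": "h3 product-title"}).text.strip()
            pret = thumbnail.find('span', {"class": "price"}).text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
        i += 1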
This code works fine too, using the class attribute with bs4:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    for i in range(1, 50):
        r = requests.get("https://azetshop.ro/12-extensa?page=" + format(i))
        soup = BeautifulSoup(r.text, "html.parser")
        array_price = soup.find_all('span', class_='price')
        array_desc = soup.find_all('h1', class_='h3 product-title', text=True)
        for iterator in range(0, len(array_price)):
            descriere = array_desc[iterator].text.strip()
            pret = array_price[iterator].text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
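A small note on the indexed inner loop above: if a page ever returns more prices than titles (or vice versa), array_desc[iterator] raises an IndexError. Pairing the two lists with zip() sidesteps that, since zip() stops at the shorter sequence. A sketch, replacing just the inner for loop:

for desc_tag, price_tag in zip(array_desc, array_price):
    # zip() pairs items positionally and stops at the shorter list,
    # so a mismatch in counts cannot raise an IndexError
    descriere = desc_tag.text.strip()
    pret = price_tag.text.strip()
    writer.writerow([descriere, pret, datetime.datetime.now()])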
