Python & BeautifulSoup 4 - Scraping code optimization

I'm attempting to scrape multiple websites for specific products and I'm sure there is a way to optimize my code. As of right now, the code does its job, but this is really not the Pythonic way to go about it (I am a Python novice, so please excuse my lack of knowledge).
The goal of this program is to get the prices of the products from the URLs provided and write them to a .csv file. Each website has a different structure, but I am always using the same 3 websites. This is an example of my current code:
import requests
import csv
import io
import os
from datetime import datetime
from bs4 import BeautifulSoup
timeanddate = datetime.now().strftime("%Y%m%d-%H%M%S")
folder_path = 'my_folder_path'
file_name = 'product_prices_' + timeanddate + '.csv'
full_name = os.path.join(folder_path, file_name)

with io.open(full_name, 'w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["ProductTitle", "Website1", "Website2", "Website3"])

    # ---Product 1---
    # Website1 price
    website1product1 = requests.get('website1product1URL')
    website1product1Data = BeautifulSoup(website1product1.text, 'html.parser')
    website1product1Price = website1product1Data.find('div', attrs={'class': 'price-final'}).text.strip()
    print(website1product1Price)

    # Website2 price
    website2product1 = requests.get('website2product1URL')
    website2product1Data = BeautifulSoup(website2product1.text, 'html.parser')
    website2product1Price = website2product1Data.find('div', attrs={'class': 'price_card'}).text.strip()
    print(website2product1Price)

    # Website3 price
    website3product1 = requests.get('website3product1URL')
    website3product1Data = BeautifulSoup(website3product1.text, 'html.parser')
    website3product1Price = website3product1Data.find('strong', attrs={'itemprop': 'price'}).text.strip()
    print(website3product1Price)

    writer.writerow(["ProductTitle", website1product1Price, website2product1Price, website3product1Price])
It saves the product titles and prices to a .csv in the following format, which I'd like to keep:
#Header
ProductTitle   Website1   Website2   Website3
#Scraped data
Product1       $23        $24        $52
This is manageable for a few products, but I'd like to have hundreds, and copying the same lines of code while changing variable names is confusing, tedious, and bound to be riddled with human error.
Can I create a function that takes 3 URLs as arguments, outputs website1product1Price, website2product1Price and website3product1Price, and call that function once per product? Can it then be wrapped in a loop to go through a list of URLs and still keep the original formatting?
Any help is appreciated.

Could this be a solution for you?
Assuming you have a list of dicts for your products:
products = [
    {
        'name': 'product1',
        'url1': 'https://url1',
        'url2': 'https://url2',
        'url3': 'https://url3'
    }
]
Your code could be something like this:
import requests
import csv
import io
import os
from datetime import datetime
from bs4 import BeautifulSoup
def get_product_prices(product):
    # Website1 price
    response1 = requests.get(product['url1'])
    soup1 = BeautifulSoup(response1.text, 'html.parser')
    price1 = soup1.find('div', attrs={'class': 'price-final'}).text.strip()
    # Website2 price
    response2 = requests.get(product['url2'])
    soup2 = BeautifulSoup(response2.text, 'html.parser')
    price2 = soup2.find('div', attrs={'class': 'price_card'}).text.strip()
    # Website3 price
    response3 = requests.get(product['url3'])
    soup3 = BeautifulSoup(response3.text, 'html.parser')
    price3 = soup3.find('strong', attrs={'itemprop': 'price'}).text.strip()
    return price1, price2, price3

timeanddate = datetime.now().strftime("%Y%m%d-%H%M%S")
folder_path = 'my_folder_path'
file_name = 'product_prices_' + timeanddate + '.csv'
full_name = os.path.join(folder_path, file_name)

with io.open(full_name, 'w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["ProductTitle", "Website1", "Website2", "Website3"])
    for product in products:
        price1, price2, price3 = get_product_prices(product)
        writer.writerow([product['name'], price1, price2, price3])
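One hedge worth adding for a run over hundreds of products: if a request fails, or a page layout changes so that find() returns None, the .text call raises and kills the whole run. A small helper along these lines avoids that (safe_price and the 'N/A' placeholder are my own naming, not from the question):

def safe_price(url, tag, attrs):
    # Return the stripped text of the first matching element, or 'N/A' on any failure.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return 'N/A'  # network error or bad HTTP status: record a placeholder
    element = BeautifulSoup(response.text, 'html.parser').find(tag, attrs=attrs)
    return element.text.strip() if element is not None else 'N/A'

Each requests/BeautifulSoup triple in get_product_prices then collapses to a single call such as safe_price(product['url1'], 'div', {'class': 'price-final'}).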

You can create a function and pass everything as parameters: url, tag name, attribute name and attribute value. See if this helps.
def price_text(url_text, ele_tag, ele_attr, attrval):
    response = requests.get(url_text)
    soup = BeautifulSoup(response.text, 'html.parser')
    price = soup.find(ele_tag, attrs={ele_attr: attrval}).text.strip()
    print(price)
    return price

website1Price = price_text("url", "div", "class", "price-final")
website2Price = price_text("url", "div", "class", "price_card")
website3Price = price_text("url", "strong", "itemprop", "price")
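To keep the one-row-per-product CSV layout from the question, this parameterized function combines naturally with the products list from the previous answer. A sketch, assuming price_text returns the price and writer is the csv.writer opened earlier:

# One (tag, attribute, value) selector per website, in column order (taken from the question)
selectors = [
    ('div', 'class', 'price-final'),   # Website1
    ('div', 'class', 'price_card'),    # Website2
    ('strong', 'itemprop', 'price'),   # Website3
]

for product in products:
    urls = [product['url1'], product['url2'], product['url3']]
    prices = [price_text(url, tag, attr, value)
              for url, (tag, attr, value) in zip(urls, selectors)]
    writer.writerow([product['name']] + prices)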

Related

Attempting to retrieve text from <td></td> tags using BeautifulSoup

So I'm using BeautifulSoup to scrape the link in the code. The artist names and the links come out fine, but I'm not sure how to access the nationality in that second tag.
Here's the code:
import requests
import csv
from bs4 import BeautifulSoup

def findName():
    page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    last_links = soup.find(class_='AlphaNav')
    last_links.decompose()
    f = csv.writer(open('h-artist_lastname.csv', 'w'))  # Create a file to write
    f.writerow(['Last Name, First Name', 'Nationality', 'Link'])
    artist_name_list = soup.find(class_='BodyText')
    artist_name_list_items = artist_name_list.find_all('a')
    artist_nationality_list_items = artist_name_list.find_all('td')
    print(artist_nationality_list_items)
    for artist_name in artist_name_list_items:
        names = artist_name.contents[0]
        #nationalities = artist_nationality_list_items.contents[0]
        links = 'https://web.archive.org' + artist_name.get('href')
        #print(nationalities)
        f.writerow([names, links])

findName()
If I uncomment the line in the for loop, I get a runtime error which I expect. The print statement gives me this value for artist_nationality_list_items:
<td>Babbitt, Platt D.</td>, <td>American, died 1879</td>, ..... <- follows this pattern for every artist
Basically, I want the part with 'American, died 1879'.
You can use select, which accepts CSS selectors, with :nth-child(2) to select the second <td> in each <tr> instead of find_all. So this:
artist_nationality_list_items = artist_name_list.find_all('td')
becomes:
artist_nationality_list_items = artist_name_list.select('td:nth-child(2)')
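Pairing the selected nationality cells back up with the artist links can then be done with zip. A sketch, assuming the anchors and the nationality cells line up one-to-one:

nationality_cells = artist_name_list.select('td:nth-child(2)')
for artist_name, nationality_cell in zip(artist_name_list_items, nationality_cells):
    names = artist_name.contents[0]
    nationality = nationality_cell.text
    links = 'https://web.archive.org' + artist_name.get('href')
    f.writerow([names, nationality, links])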
You can still work with contents, but don't get bogged down in all the lists - select your target more specifically and gather all the information in one pass.
What happens?
You're treating artist_nationality_list_items (a list) like a single element; that won't work.
How to fix?
To get the right result from your artist_nationality_list_items you have to iterate it too.
(Works, but bad idea):
for i, artist_name in enumerate(artist_name_list_items):
    names = artist_name.contents[0]
    # name tds and nationality tds alternate, so artist i's nationality sits at index 2*i + 1
    nationalities = artist_nationality_list_items[2*i + 1].contents[0]
    links = 'https://web.archive.org' + artist_name.get('href')
Alternative and much leaner approach
import requests, csv
from bs4 import BeautifulSoup

def findName():
    page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    f = csv.writer(open('h-artist_lastname.csv', 'w'))  # Create a file to write
    f.writerow(['Last Name, First Name', 'Nationality', 'Link'])
    for row in soup.select('div.BodyText h3+table tr'):
        names = row.contents[0].text
        nationalities = row.contents[1].text
        links = 'https://web.archive.org' + row.a.get('href')
        #print([names, nationalities, links])
        f.writerow([names, nationalities, links])

findName()
A little bit of a botched answer with some sloppy workarounds, but this resulted in what I needed:
import requests
import csv
from bs4 import BeautifulSoup

def findName():
    page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    last_links = soup.find(class_='AlphaNav')
    last_links.decompose()
    f = csv.writer(open('b-artist_lastname.csv', 'w'))  # Create a file to write
    f.writerow(['Last Name, First Name', 'Nationality', 'Link'])
    artist_name_list = soup.find(class_='BodyText')
    artist_name_list_items = artist_name_list.find_all('a')
    i = 2
    for artist_name in artist_name_list_items:
        str_list = list('td:nth-of-type(i)')
        str_list[15] = str(i)  # index 15 is the placeholder 'i' inside the selector
        selection = "".join(str_list)
        names = artist_name.contents[0]
        nationality = artist_name_list.select(selection)
        links = 'https://web.archive.org' + artist_name.get('href')
        nat_to_str = str(nationality)
        nat_str_final = nat_to_str[5:len(nat_to_str) - 6]  # strip the surrounding [<td> and </td>]
        #print(nat_str_final)
        f.writerow([names, nat_str_final, links])
        i += 2

findName()
Thank you to everyone who answered. Using 'td:nth-of-type()' seemed to work, but to get every artist on the page I needed to increase the value inside nth-of-type each time, so I used a list of chars and converted it back into a string after incrementing i at each traversal.
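A simpler way to build the same selector is an f-string, which removes the need for the character list entirely; this sketch produces the identical string:

i = 2
selection = f'td:nth-of-type({i})'  # same string the list-of-chars trick builds
print(selection)  # td:nth-of-type(2)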

Writing scraped data in rows with python

I have a basic bs4 web scraper. There are no issues getting my scraped data, but when I try to write it to a .csv file I run into problems: I am unable to write my data to more than one column. In the tutorial I loosely follow, the author separates columns with "," easily, but when I open my CSV in Excel there is no separation in either the header or the data. What am I missing?
import requests
from bs4 import BeautifulSoup

url = "myurl"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
items = soup.find_all('a', class_='listing-card')

filename = 'data.csv'
f = open(filename, "w")
header = "name, price\n"
f.write(header)
for item in items:
    title = item.find('span', class_='title').text
    price = item.find('span', class_='price').text
    f.write(title.replace(",", "|") + ',' + price + "\n")
f.close()
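As an aside on the replace(",", "|") workaround: the csv module automatically quotes any field that contains a comma, so switching the manual f.write calls to csv.writer removes the need for it entirely. A sketch reusing the items list from the code above:

import csv

with open('data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'price'])
    for item in items:
        title = item.find('span', class_='title').text
        price = item.find('span', class_='price').text
        writer.writerow([title, price])  # embedded commas are quoted automatically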
Another method.
from simplified_scrapy import SimplifiedDoc, utils, req

url = "myurl"
html = req.get(url)
rows = []
rows.append(['name', 'price'])  # Add header
doc = SimplifiedDoc(html)
items = doc.getElements('a', attr='class', value='listing-card')  # Get all <a> nodes with this class
for item in items:
    title = item.getElement('span', value='title').text
    price = item.getElement('span', value='price').text
    rows.append([title, price])
utils.save2csv('data.csv', rows)  # Save to CSV file
Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
I have found that the easiest way to get your data into a CSV file is to put the data into a pandas DataFrame then use the to_csv method to write the file.
Using your example the code would be as follows:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "myurl"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
items = soup.find_all('a', class_='listing-card')

# Create an empty list to store entries
mylist = []
for item in items:
    title = item.find('span', class_='title').text
    price = item.find('span', class_='price').text
    # Create the dictionary item to be appended to the list
    entry = {'name': title, 'price': price}
    mylist.append(entry)

myDataframe = pd.DataFrame(mylist)
myDataframe.to_csv('CSV_file.csv', index=False)  # index=False keeps the row numbers out of the file
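If Excel still lumps everything into one column after this, it may be a regional-settings issue rather than a code issue: Excel in some locales expects a semicolon as the list separator, and to_csv can accommodate that via its sep parameter:

# Only if your Excel locale expects semicolons as the separator
myDataframe.to_csv('CSV_file.csv', sep=';', index=False)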

bs4 python extracting value from <span></span> to .csv printing the same result over and over

I have managed to build a very primitive program to scrape vehicle data from pistonheads and print the link, make and model to a .csv file, and I am working on getting the price, which is where I am encountering a problem.
I want to scrape the prices into the fourth column of my .csv file (Price) and to correctly print the price of each vehicle on the website.
At the moment it only prints the price of one vehicle and repeats it again and again next to every vehicle in the .csv file.
I have tried soup.findAll and soup.find_all to see whether parsing through multiple elements would work, but this just created a bigger mess.
Might someone be able to help?
I am also trying to scrape the image src and would like to print that in another column (5) called Images.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')
car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = ("https://www.pistonheads.com" + a['href'])
        make = (a['href'].split('/')[-4])
        model = (a['href'].split('/')[-3])
        price = soup.find('span')
        writer.writerow([link, make, model, price])
        print(link, make, model, price)
outfile.close()
You can try this:
import csv, requests, re
from bs4 import BeautifulSoup as soup

d = soup(requests.get('https://www.pistonheads.com/classifieds?Category=used-cars&ResultsPerPage=100').text, 'html.parser')

def extract_details(_s: soup) -> list:
    _link = _s.find('a', {'href': re.compile(r'/classifieds/used-cars/')})['href']
    _, _, make, model, *_ = _link[1:].split('/')
    price, img = _s.find('div', {'class': 'price'}).text, [i['src'] for i in _s.find_all('img')]
    return [_link, make, model, price, 'N/A' if not img else img[0]]

with open('filename.csv', 'w', newline='') as f:
    _listings = [extract_details(i) for i in d.find_all('div', {'class': 'ad-listing'}) if i.find('div', {'class': 'price'})]
    write = csv.writer(f)
    write.writerows([['link', 'make', 'model', 'price', 'img'], *_listings])
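If the starred unpacking in extract_details looks opaque: _link[1:].split('/') yields the path segments, and _, _, make, model, *_ keeps only the third and fourth. A quick demonstration with a hypothetical listing path:

link = '/classifieds/used-cars/porsche/911/12345'  # hypothetical path
_, _, make, model, *_ = link[1:].split('/')
print(make, model)  # porsche 911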
The reason is this line: price = soup.find('span')
.find() grabs the first element it finds, and you have it looking in your whole soup object. Where you want it to look is within your a, because that's what you are looping through with for a in links:.
I also added .text, as I am assuming you just want the text, not the whole tag element, i.e. price = a.find('span').text.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price", "Images"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')
car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = "https://www.pistonheads.com" + a['href']
        make = a['href'].split('/')[-4]
        model = a['href'].split('/')[-3]
        price = a.find('span').text
        image_link = a.parent.parent.find('img')['src']
        image = link + image_link
        writer.writerow([link, make, model, price, image])
        print(link, make, model, price, image)
outfile.close()

how to loop using beautifulsoup

I am trying to scrape data on car model, price, mileage, location, etc. using BeautifulSoup. However, the returned result only reports data on one random car. I want to be able to collect data on all cars advertised on the site to date. My python code is below. How can I modify my code to retrieve data such that each day I have information on car model, price, mileage, location, etc.? Example:
Car model        price   mileage   location   date
Toyota Corrola   $4500   22km      Accra      16/02/2018
Nissan Almera    $9500   60km      Tema       16/02/2018
etc.
import requests
from bs4 import BeautifulSoup
import pandas
import csv
from datetime import datetime

for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)

r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
print soup.prettify()

data = soup.find(class_='item-content')
for tag in data:
    item_title = data.find("a", attrs={"class":"item-title h4"})
    model = item_title.text.encode('utf-8').strip()
    item_meta = data.find("p", attrs={"class":"item-meta"})
    mileage = item_meta.text.encode('utf-8').strip()
    item_location = data.find("p", attrs={"class":"item-location"})
    location = item_location.text.encode('utf-8').strip()
    item_info = data.find("p", attrs={"class":"item-info"})
    price = item_info.text.encode('utf-8').strip()
    with open('example.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([model, price, mileage, location, datetime.now()])
First off, this loop:
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
is not doing what I assume you think it is. This loop simply resets the url 300 times and leaves you with the original url you set: the format() call has no {} placeholder to fill, so the string never changes. You need to wrap all your code in this loop to ensure you are hitting each of the URLs you want (0-299).
Restructure your code (paying attention to indents!) so that the next url is the one being used in the request:
# This will print A LOT of titles
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
    print(url)  # Notice how the url changes with each iteration?
    r = requests.get(url)
    soup = bsoup(r.content, "html.parser")
    titles = soup.findAll("a", attrs={"class":"item-title h4"})
    for item in titles:
        currTitle = item.text.encode('utf-8').strip()
        print(currTitle)
This code:
import requests
from bs4 import BeautifulSoup as bsoup

url = "https://tonaton.com/en/ads/ghana/cars?1"
r = requests.get(url)
soup = bsoup(r.content, "html.parser")
titles = soup.findAll("a", attrs={"class":"item-title h4"})
for item in titles:
    print(item.text.encode('utf-8').strip())
Yields (the b prefix just marks a bytes object, produced by the .encode('utf-8') call):
b'Hyundai Veloster 2013'
b'Ford Edge 2009'
b'Mercedes-Benz C300 2016'
b'Mazda Demio 2007'
b'Hyundai Santa fe 2005'
# And so on...
The problem is that 1) if you call find(), it will stop after the first match given your params; using findAll() will dump all matches into a list which you can then iterate through and process as needed. And 2) the result you get from a call to find() is a single fragment of the original HTML, so repeating the same find() calls on it just returns the same elements over and over.
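A minimal illustration of the difference, using made-up markup:

from bs4 import BeautifulSoup

html = "<p class='x'>a</p><p class='x'>b</p>"
soup = BeautifulSoup(html, "html.parser")
print(soup.find("p", attrs={"class": "x"}).text)                  # a  (first match only)
print([p.text for p in soup.findAll("p", attrs={"class": "x"})])  # ['a', 'b']  (all matches)

Applied to the scraper in question, the full script looks like this: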
import requests
from bs4 import BeautifulSoup as bsoup
import csv
from datetime import datetime

with open('index.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for i in range(300):
        url = "https://tonaton.com/en/ads/ghana/cars?{}".format(i)
        r = requests.get(url)
        soup = bsoup(r.content, "html.parser")
        item_titles = soup.findAll("a", attrs={"class":"item-title h4"})
        item_metas = soup.findAll("p", attrs={"class":"item-meta"})
        item_locations = soup.findAll("p", attrs={"class":"item-location"})
        item_infos = soup.findAll("p", attrs={"class":"item-info"})
        # Pair the parallel lists so each row describes a single car
        for title, meta, loc, info in zip(item_titles, item_metas, item_locations, item_infos):
            model = title.text.strip()
            mileage = meta.text.strip()
            location = loc.text.strip()
            price = info.text.strip()
            writer.writerow([model, price, mileage, location, datetime.now()])

Trouble dealing with header in a csv file

I've written some code using python to scrape some titles and prices from a webpage and write the results to a csv file. The script itself runs fine, but since I'm appending to the csv file, the headers get written on every pass: if it runs 4 loops, the headers are written 4 times. How can I fix it so that the headers are written only once? Thanks.
This is the script:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']

for link in diction_page:
    res = requests.get(link).text
    soup = BeautifulSoup(res,'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title,price)

    with open('item.csv','a',newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Title","Price"])
        writer.writerow([title, price])
As an option you can try this:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']

for i, link in enumerate(diction_page):
    res = requests.get(link).text
    soup = BeautifulSoup(res,'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title,price)

    with open('item.csv','a',newline='') as outfile:
        writer = csv.writer(outfile)
        if i == 0:
            writer.writerow(["Title","Price"])
        writer.writerow([title, price])
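Another common guard, useful when the script is run repeatedly and keeps appending across runs: write the header only while the file is still missing or empty. A sketch using os.path, meant to replace the with block inside the same loop (title and price as above):

import os

write_header = not os.path.exists('item.csv') or os.path.getsize('item.csv') == 0
with open('item.csv', 'a', newline='') as outfile:
    writer = csv.writer(outfile)
    if write_header:
        writer.writerow(["Title", "Price"])
    writer.writerow([title, price])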
Don't write the headers in the for loop:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']

outfile = open('item.csv','w',newline='')
writer = csv.writer(outfile)
writer.writerow(["Title","Price"])

for link in diction_page:
    res = requests.get(link).text
    soup = BeautifulSoup(res,'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title,price)
    writer.writerow([title, price])

outfile.close()
