This is my first attempt at scraping information from a website and exporting it to an Excel file. However, not all of the information is scraped, and the export file is not created either.
This is what I get in the Anaconda prompt:
(base) C:\Windows\system32>firstwebscrape.py
brand: []
product_name: ASRock Radeon RX 5700 XT DirectX 12 RX 5700 XT TAICHI X 8G OC+ Video Card
product_price: €446,99
Here is the code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphic%20card'

# opening up the connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parser
page_soup = soup(page_html, "html.parser")

# grabs all containers
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "123.csv"
f = open(filename, "w")

headers = "brand, product_name, product_price\n"
f.write(headers)

for container in containers:
    brand = container.findAll("a", {"class": "title"})
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li", {"class": "price-current"})
    product_price = price_container[0].text.strip()

print("brand: ", brand)
print("product_name: " + product_name)
print("product_price: " + product_price)

f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")

f.close()
Your code runs fine. Just correct this in your loop:
for container in containers:
    brand = container.findAll("a", {"class": "title"})
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li", {"class": "price-current"})
    product_price = price_container[0].text.strip()

    # these code lines have to be in your for loop!
    print("brand: ", brand)
    print("product_name: " + product_name)
    print("product_price: " + product_price)

    f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")
You want to print and save for every item in your iteration over containers. Otherwise only the last item gets saved to your CSV.
Here is a solution that splits the job between getting the data, extracting the fields, and writing the result. It also leaves the job of writing the CSV data to the csv module.
import csv
import re
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup


def extract_brand(c):
    """Locate brand.

    At most 3 words in brand.
    """
    tag = c.find('img', class_='lazy-img')
    tmp = tag.get('alt')
    m = re.match(r'(\w+\s?){1,3}', tmp)
    brand = m.group(0).rstrip() if m else 'No Brand Found'
    return brand


def extract_product(c):
    title_container = c.find('a', class_='item-title')
    product_name = title_container.string
    return product_name.replace(',', '|').strip()


def extract_price(c):
    price_container = c.find('li', class_='price-current')
    tmp = price_container.string
    if not tmp:
        tmp = ''.join(price_container.stripped_strings)
    m = re.match(r'(.\d[\d.,]+)', tmp.strip())
    product_price = m.group(0) if m else "?"
    return product_price


def extract_from(page):
    """Extract data for each product.

    Return a list containing data for one product per list item.
    """
    containers = page.find_all('div', class_='item-container')
    data = []
    for container in containers:
        item = []
        item.append(extract_brand(container))
        item.append(extract_product(container))
        item.append(extract_price(container))
        data.append(item)
    return data


def write2csv(filename, data):
    with open(filename, 'w', newline='') as csvfile:
        fd = csv.writer(csvfile)
        headers = ["brand", "product_name", "product_price"]
        fd.writerow(headers)
        fd.writerows(data)


def get_html_from(url, parser='html.parser'):
    with uReq(url) as uClient:
        page_html = uClient.read()
    page_soup = BeautifulSoup(page_html, parser)
    return page_soup


my_url = ('https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/'
          'Category/ID-38?Tpk=graphic%20card')

page = get_html_from(my_url)
data = extract_from(page)

filename = "1234.csv"
write2csv(filename, data)
I am trying to download a zip file, which can also be downloaded by clicking "SCARICA I DATI CSV" ("download the CSV data") on this webpage. I want to do this for 7000+ Italian municipalities using Beautiful Soup.
Right now, I have the following code for one city/municipality:
import urllib.request
from bs4 import BeautifulSoup

city_name = "vandoies-vintl"
prov_name = "bz"

r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
soup = BeautifulSoup(r, 'lxml')

# this is where the code breaks, because the HTML body does not have any mention of "csv" whatsoever, which is weird.
csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
csvlink = csv[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")
I can't find any mention of csv when I check with print(soup). Could someone please help? Thanks!
The following code works.
import os
import re
import zipfile
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

munis = [("monale", "at"), ("portacomaro", "at")]
munis = pd.DataFrame(munis)
munis.columns = ['municipality_clean', 'prov_abb']

def remove_paren(string):
    return re.sub(r'\(.*\)', '', str(string))

munis['municipality_clean'] = munis['municipality_clean'].apply(lambda x: remove_paren(x))
munis['municipality_clean'] = munis['municipality_clean'].str.strip()
munis = munis.replace(' ', '-', regex=True)
munis = munis.apply(lambda x: x.str.lower())

# success/failure logs; these must exist before the loop appends to them
# (note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions)
scrapesuccess = pd.DataFrame(columns=munis.columns)
scrapefail = pd.DataFrame(columns=munis.columns)

for i in range(0, len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']
    try:
        r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
        soup = BeautifulSoup(r, 'lxml')
        csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
        try:
            csvlink = csv[0]['href']
            urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")
            #print('Downloaded and extracted zip for ' + city_name + ', ' + prov_name)
            print(str(i) + ". " + city_name + ": success")
            scrapesuccess = scrapesuccess.append(munis.iloc[i])

            newfolder = output_path + "/" + city_name.capitalize()
            if not os.path.exists(newfolder):
                os.makedirs(newfolder)

            zip_ref = zipfile.ZipFile(output_path + "/" + city_name + ".zip", 'r')
            zip_ref.extractall(newfolder)
            zip_ref.close()
        except:
            scrapefail = scrapefail.append(munis.iloc[i])
            print(str(i) + ". " + city_name + ": fail")
    except:
        scrapefail = scrapefail.append(munis.iloc[i])
        print(str(i) + ". " + city_name + ": fail")
Here's an example of downloading the zip in memory and writing a city directory with all the CSV files.
import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")

    def main(self) -> None:
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"
        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")
        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        url = f"{self.url_root}{zip_link}"
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        zip_file_object = ZipFile(file_handle, "r")
        files = zip_file_object.namelist()
        return {
            file: pd.read_csv(StringIO(zip_file_object.open(file).read().decode("utf-8")), sep=";")
            for file in files
        }

    def write_files(self, file_mapping: dict) -> None:
        for file, df in file_mapping.items():
            file_path, file_name = file.rsplit("/", 1)
            path = Path(f"/path/to/files/{self.city_name}/{file_path}")
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(f"{path}/{file_name}")


city_name = "vandoies-vintl"
prov_name = "bz"

Scraper(city_name=city_name, prov_name=prov_name).main()
I was trying to scrape some data with BeautifulSoup in Python from a site that has some products, and then store the data in text files in separate folders. In the code below I am stuck near the end; I have included the rest just for reference.
import unittest, time, random
import urllib.request
import os
from selenium import webdriver
from selenium.common.exceptions import InvalidArgumentException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd

links = []
soup_list = []
imgs = []
website = "https://www.energystar.gov/productfinder"
rate = [i/10 for i in range(10)]
cnt = 0
quote = '"'
newline = '\n'
colon = ' : '

browser = webdriver.Firefox(executable_path="C:\\Users\\abc\\.wdm\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
url2 = 'https://www.energystar.gov/productfinder/product/certified-room-air-cleaners/results?page_number='

def getdata(url):
    browser.get(url)
    content = browser.page_source
    soup1 = BeautifulSoup(content, "html.parser")
    return soup1

#pagenos = ['0','13']
pagenos = []
for i in range(0, 2):
    pagenos.append(i)
    i =+ 1
print(pagenos)

for i in range(0, len(pagenos)):
    url = url2 + str(pagenos[i])
    soup1 = getdata(url)
    soup_list.append(soup1)
    for main in soup1.findAll('div', attrs={'class': 'row certified-room-air-cleaners'}):
        name = main.find('a', href=True)
        if (name != ''):
            links.append((name.get('href')).strip())

print("Got links : ", len(links))
print("Got soups : ", len(soup_list))
#print('Soup 1:', soup_list[1])

for link in links:
    # just for testing 10 links
    cnt = cnt + 1
    if cnt >= 20:
        break

    # time delay before we access the next page..
    time.sleep(random.choice(rate))

    #print("Fetching link..... : ", link)
    link = link[5:]
    #print("Fetching link..... : ", link)
    link = website + link
    browser.get(link)
    linkcontent = browser.page_source
    soup2 = BeautifulSoup(linkcontent, "html.parser")

    pmname = soup2.find('div', attrs={'class': 'l-wrapper'}).find('h1')
    if not pmname:
        print("Error no product name for link : ", link)
        continue
    pmname = pmname.text.strip().split(' - ')
    bname = pmname[0].strip()
    mname = pmname[1].strip()
    #print(bname)
    #print(mname)

    # Creating folder with Brand Name as name
    try:
        os.makedirs(str(bname))
    except FileExistsError:
        # directory already exists
        pass

    # Creating text files with model number as name
    fname = mname + '.txt'
    path = '/' + str(bname)
    a = os.path.join(path, fname)
    print(a)
    ff = open(fname, mode='w')
    ff.write("BRAND NAME : " + quote + bname + quote + newline)
    ff.write("MODEL : " + quote + mname + quote + newline)

browser.close()
exit()
Here I can successfully create the text file, but I can't seem to write it to the created path.
You are missing something: if you are writing a list of items to a file, you must open the file in write mode ("w"); if you instead want to append to existing data, the file must be opened in "a" mode so the new data is added at the end. In either case the file has to be opened before you write anything to it and closed at the end once you are finished.
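As a concrete illustration, here is a minimal sketch (not from the original answer) of how the end of the loop in the question could write each file into the brand folder instead of the working directory. `bname` and `mname` are the variables from the question's code; the sample values below are hypothetical.

import os

bname = "ExampleBrand"   # hypothetical brand name extracted from the page
mname = "ABC-123"        # hypothetical model number extracted from the page

os.makedirs(bname, exist_ok=True)                 # create the brand folder if it does not exist yet
file_path = os.path.join(bname, mname + ".txt")   # full path inside that folder, not just the bare filename

# "w" creates/overwrites the file; use "a" instead if you want to append to an existing file
with open(file_path, mode="w") as ff:
    ff.write('BRAND NAME : "' + bname + '"\n')
    ff.write('MODEL : "' + mname + '"\n')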
I tried to create a web crawler, but unfortunately an error occurred during execution. The crawler should read data from the website; I get the title from the image.
brand = make_rating_sp[0].img["title"].title()
TypeError: 'NoneType' object is not subscriptable
Why does this error occur, and how can I fix it?
I would be very happy to receive an answer.
pip install urlopen
pip install beautifulsoup4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

containers = page_soup.findAll("div", {"class": "item-container"})

out_filename = "graphics_cards.csv"
headers = "brand,product_name,shipping \n"

f = open(out_filename, "w")
f.write(headers)

for container in containers:
    divWithInfo = containers[1].find("div", "item-info")
    make_rating_sp = container.div.select("a")
    brand = make_rating_sp[2].img["title"].title()
    product_name = container.div.select("a")[2].text
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")

    print("brand: " + brand + "\n")
    print("product_name: " + product_name + "\n")
    print("shipping: " + shipping + "\n")

    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")

f.close()  # Close the file
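For reference, the TypeError above means that `.img` returned `None` for one of the anchors (there was no `<img>` tag inside it), so subscripting it with `["title"]` fails. Below is a minimal, hedged sketch of a guard that skips such containers; it reuses the names and indices from the code above and is only one possible fix.

for container in containers:
    make_rating_sp = container.div.select("a")
    # guard: some containers have no brand image, which is what triggers the TypeError
    img_tag = make_rating_sp[2].img if len(make_rating_sp) > 2 else None
    if img_tag is None or not img_tag.has_attr("title"):
        continue  # skip items without a brand image/title
    brand = img_tag["title"].title()
    print("brand: " + brand)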
I've just tried to write a BeautifulSoup script for AliExpress. Even though the script runs from the command line, the output file is not created properly.
Can you please help me?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.aliexpress.com/category/200003482/dresses.html?spm=2114.11010108.101.3.650c649b3Cw8J9&g=y'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, 'html.parser')

# grabs each product
containers = page_soup.findAll('div', {'class': 'item'})

filename = 'product.csv'
f = open(filename, 'w')
headers = 'product_name, item_price, store_name\n'
f.write(headers)

contain = containers[0]
container = containers[0]

for container in containers:
    product_name = container.h3.a.text
    product_container = container.findAll('span', {'class': 'price price-m'})
    price_container = product_container[0].findAll('span', {'class': 'value'})
    item_price = price_container[0].text
    store_container = container.findAll('div', {'class': 'info-more'})
    store_container[0].findAll('div', {'class': 'store-name util-clearfix'})
    name_container = store_container[0].findAll('div', {'class': 'store-name util-clearfix'})
    store_name = name_container[0].a.text

    print('product_name: ' + product_name)
    print('item_price: ' + item_price)
    print('store_name: ' + store_name)

    f.write(product_name + ',' + item_price + ',' + store_name + '\n')

f.close()
I am using the following code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956, 2016))

for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()

        page_soup = BeautifulSoup(html_input, "html.parser")
        container = page_soup.findAll("li")

        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)

        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
I get text that does not seem to belong in the "li" container, but it gets written to the output. This is the unwanted text:
<!--
google_ad_client = "ca-pub-9635531430093553";
/* in medias res */
google_ad_slot = "9880694813";
google_ad_width = 468;
google_ad_height = 60;
//-->
How can I get rid of it?
The text you don't want is coming from a script element. So get rid of the script elements before you start and it works:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956, 2016))

for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()

        page_soup = BeautifulSoup(html_input, "html.parser")
        [s.extract() for s in page_soup('script')]
        container = page_soup.findAll("li")

        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)

        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
All I did was add this one line:
[s.extract() for s in page_soup('script')]
which finds the script elements and removes them.
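As a side note, the same cleanup can be written with BeautifulSoup's decompose(), which removes each tag from the tree and destroys it; a minimal equivalent sketch:

# same effect as the list comprehension above, written as a plain loop
for s in page_soup('script'):
    s.decompose()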