Unable to add text file to path in Python

I was trying to scrape some data with BeautifulSoup in Python from a site that lists some products, and then store it in text files in separate folders. In the code below I am stuck near the end; I have included the rest just for reference.
import unittest, time, random
import urllib.request
import os
from selenium import webdriver
from selenium.common.exceptions import InvalidArgumentException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd

links = []
soup_list = []
imgs = []
website = "https://www.energystar.gov/productfinder"
rate = [i/10 for i in range(10)]
cnt = 0
quote = '"'
newline = '\n'
colon = ' : '

browser = webdriver.Firefox(executable_path="C:\\Users\\abc\\.wdm\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
url2 = 'https://www.energystar.gov/productfinder/product/certified-room-air-cleaners/results?page_number='

def getdata(url):
    browser.get(url)
    content = browser.page_source
    soup1 = BeautifulSoup(content, "html.parser")
    return soup1

#pagenos = ['0','13']
pagenos = []
for i in range(0, 2):
    pagenos.append(i)
print(pagenos)

for i in range(0, len(pagenos)):
    url = url2 + str(pagenos[i])
    soup1 = getdata(url)
    soup_list.append(soup1)
    for main in soup1.findAll('div', attrs={'class': 'row certified-room-air-cleaners'}):
        name = main.find('a', href=True)
        if (name != ''):
            links.append((name.get('href')).strip())

print("Got links : ", len(links))
print("Got soups : ", len(soup_list))
#print('Soup 1:', soup_list[1])

for link in links:
    # just for testing 10 links
    cnt = cnt + 1
    if cnt >= 20:
        break
    # time delay before we access the next page..
    time.sleep(random.choice(rate))
    #print("Fetching link..... : ", link)
    link = link[5:]
    #print("Fetching link..... : ", link)
    link = website + link
    browser.get(link)
    linkcontent = browser.page_source
    soup2 = BeautifulSoup(linkcontent, "html.parser")
    pmname = soup2.find('div', attrs={'class': 'l-wrapper'}).find('h1')
    if not pmname:
        print("Error no product name for link : ", link)
        continue
    pmname = pmname.text.strip().split(' - ')
    bname = pmname[0].strip()
    mname = pmname[1].strip()
    #print(bname)
    #print(mname)
    # Creating folder with Brand Name as name
    try:
        os.makedirs(str(bname))
    except FileExistsError:
        # directory already exists
        pass
    # Creating text files with model number as name
    fname = mname + '.txt'
    path = '/' + str(bname)
    a = os.path.join(path, fname)
    print(a)
    ff = open(fname, mode='w')
    ff.write("BRAND NAME : " + quote + bname + quote + newline)
    ff.write("MODEL : " + quote + mname + quote + newline)

browser.close()
exit()
Here I can successfully create the text file, but I can't seem to write it to the created path.

You are missing something:
If you open the file in write mode ('w'), any existing content is replaced, while append mode ('a') adds new data at the end of an existing file. Either way, the file must be opened before you write anything to it, and closed once you are finished so that the write is completed.
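For the specific problem in the question (the text file ending up in the working directory instead of the brand folder), here is a minimal sketch of one possible fix, using the question's variable names with placeholder values: build the path relative to the brand folder with os.path.join (no leading '/') and open that joined path rather than fname; a with block also closes the file automatically.

import os

# placeholder values; in the question these come from the scraped page
bname = "ExampleBrand"
mname = "Model123"
quote = '"'
newline = '\n'

os.makedirs(bname, exist_ok=True)      # create the brand folder if it does not exist yet
fname = mname + '.txt'
a = os.path.join(bname, fname)         # relative path, no leading '/'
with open(a, mode='w') as ff:          # open the joined path, not just fname
    ff.write("BRAND NAME : " + quote + bname + quote + newline)
    ff.write("MODEL : " + quote + mname + quote + newline)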

Related

Downloading Zip file from a webpage

I am trying to download the zip file, which can also be downloaded by clicking on "SCARICA I DATI CSV" on this webpage. I want to do this for 7000+ Italian municipalities by using Beautiful Soup.
Right now, I have the following code for one city/municipality:
city_name = "vandoies-vintl"
prov_name = "bz"
r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
soup = BeautifulSoup(r, 'lxml')
# this is where the code breaks. because the HTML body does not have any mention of "csv" whatsoever, which is weird.
csv = soup.find_all('a', attrs={'class':'pull-right csv'})
csvlink = csv[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name+".zip")
I can't find any mention of csv when checking using print(soup). Could someone please help? Thanks!
The following code works.
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request
import re
import os
import urllib
import zipfile

output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

munis = [("monale", "at"), ("portacomaro", "at")]
munis = pd.DataFrame(munis)
munis.columns = ['municipality_clean', 'prov_abb']

def remove_paren(string):
    return re.sub(r'\(.*\)', '', str(string))

munis['municipality_clean'] = munis['municipality_clean'].apply(lambda x: remove_paren(x))
munis['municipality_clean'] = munis['municipality_clean'].str.strip()
munis = munis.replace(' ', '-', regex=True)
munis = munis.apply(lambda x: x.str.lower())

# keep track of which municipalities succeeded and which failed
scrapesuccess = pd.DataFrame(columns=munis.columns)
scrapefail = pd.DataFrame(columns=munis.columns)

for i in range(0, len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']
    try:
        r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
        soup = BeautifulSoup(r, 'lxml')
        csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
        try:
            csvlink = csv[0]['href']
            # download the zip into the output folder so the same path can be opened below
            zip_path = output_path + "/" + city_name + ".zip"
            urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)
            #print('Downloaded and extracted zip for ' + city_name + ', ' + prov_name)
            print(str(i) + ". " + city_name + ": success")
            scrapesuccess = scrapesuccess.append(munis.iloc[i])
            newfolder = output_path + "/" + city_name.capitalize()
            if not os.path.exists(newfolder):
                os.makedirs(newfolder)
            zip_ref = zipfile.ZipFile(zip_path, 'r')
            zip_ref.extractall(newfolder)
            zip_ref.close()
        except:
            scrapefail = scrapefail.append(munis.iloc[i])
            print(str(i) + ". " + city_name + ": fail")
    except:
        scrapefail = scrapefail.append(munis.iloc[i])
        print(str(i) + ". " + city_name + ": fail")
Here's an example of downloading the zip into memory and writing a city directory with all of the CSV files.
import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")

    def main(self) -> None:
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"
        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")
        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        url = f"{self.url_root}{zip_link}"
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        zip_file_object = ZipFile(file_handle, "r")
        files = zip_file_object.namelist()
        return {
            file: pd.read_csv(StringIO(zip_file_object.open(file).read().decode("utf-8")), sep=";")
            for file in files
        }

    def write_files(self, file_mapping: dict) -> None:
        for file, df in file_mapping.items():
            file_path, file_name = file.rsplit("/", 1)
            path = Path(f"/path/to/files/{self.city_name}/{file_path}")
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(f"{path}/{file_name}")


city_name = "vandoies-vintl"
prov_name = "bz"

Scraper(city_name=city_name, prov_name=prov_name).main()

Beautiful Soup Object Still Works After the Requests Object it is Attached to Changes

I have some code that allows for downloading various comics off of xkcd. This code is gathered from Al Sweigart's book Automate the Boring Stuff with Python, with some minor edits made by me.
I understand most of what is going on. What's confusing is that the 'soup' BeautifulSoup object, which is made from a request named 'r', continues to provide information from the page throughout the code, even though 'r' is re-instantiated in the function 'download_image()'.
Even more confusing is that if the 'r' found in 'download_image()' is renamed to something other than 'r', the code breaks.
Code:
import requests
import os
import bs4

os.makedirs('xkcd', exist_ok=True)
page = input('What issue of xkcd would you like to download? (*all for all comics, *today for today\'s comic): ')
url = 'http://xkcd.com/'

def download_image():
    comic_url = 'http:' + comic[0].get('src')  # page with just the image
    r = requests.get(comic_url)  # switches to that page
    # gets file with directory xkcd/name of comic
    try:
        issue_number = str(int(str(soup.select('a[rel="prev"]')[0].get('href'))[1:-1]) + 1)
    except ValueError:
        issue_number = '1'
    name = os.path.basename(comic_url[:-4] + "_" + issue_number + ".png")
    file = open(os.path.join('xkcd', name), 'wb')
    print("Downloading image %s... " % name)
    # writes to file
    for chunk in r.iter_content(100000):
        file.write(chunk)
    file.close()

if page == '*all':
    url = 'http://xkcd.com/5'
    while not url.endswith('#'):
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        comic = soup.select('#comic img')
        download_image()
        prev_link = soup.select('a[rel="prev"]')[0]
        url = 'http://xkcd.com/' + prev_link.get('href')
else:
    if page == '*today':
        page = ''
    r = requests.get(url + page)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    comic = soup.select('#comic img')
    if not comic:
        print("Comic not found.")
    else:
        download_image()

"""
r = requests.get('https://imgs.xkcd.com/comics/python.png')
# makes file and write the file in bytes to it
with open('comic.png', 'wb') as f:
    f.write(r.content)
"""
Does anyone know why the soup variable continues to work after re-defining the r variable?
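A minimal sketch of what seems to be happening (this is just an illustration, not the book's code): BeautifulSoup(r.text, ...) parses a copy of the response text, so the resulting soup no longer depends on the r it was built from, and download_image() reads the module-level soup and comic globals, while its own r = requests.get(...) merely creates a local name inside the function.

import requests
import bs4

r = requests.get('https://xkcd.com/')                 # module-level (global) r
soup = bs4.BeautifulSoup(r.text, 'html.parser')       # soup keeps its own parsed copy of r.text

def demo():
    # this r is a local variable; it shadows the global r only inside demo()
    r = requests.get('https://imgs.xkcd.com/comics/python.png')
    # soup is not assigned anywhere in this function, so Python looks it up
    # in the global scope, where it still holds the parsed front page
    print(soup.title.text, len(r.content))

demo()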

Scraping does not export to Excel

This is my first attempt at scraping information from a website and exporting it to an Excel file. However, not all of the information is scraped, and the file for the export is never created.
This is what I get in Anaconda:
(base) C:\Windows\system32>firstwebscrape.py
brand: []
product_name: ASRock Radeon RX 5700 XT DirectX 12 RX 5700 XT TAICHI X 8G OC+ Video Card
product_price: €446,99 
Here is the code
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphic%20card'

#opening up the connection grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#HTML parser
page_soup = soup(page_html, "html.parser")

#grabs all containers
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "123.csv"
f = open(filename, "w")
headers = "brand, product_name, product_price\n"
f.write(headers)

for container in containers:
    brand = container.findAll("a", {"class": "title"})
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li", {"class": "price-current"})
    product_price = price_container[0].text.strip()

print("brand: ", brand)
print("product_name: " + product_name)
print("product_price: " + product_price)
f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")

f.close()
Your code runs fine. Just correct this in your loop:
for container in containers:
    brand = container.findAll("a", {"class": "title"})
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li", {"class": "price-current"})
    product_price = price_container[0].text.strip()

    # these code lines have to be in your for loop!
    print("brand: ", brand)
    print("product_name: " + product_name)
    print("product_price: " + product_price)
    f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")
You want to print and save for every item in your iteration over containers. Otherwise only the last item gets saved to your CSV.
Here is a solution that splits the job between getting the data, extracting it, and writing the result. It also leaves the job of writing the CSV data to the csv module.
import csv
import re
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup


def extract_brand(c):
    """Locate brand.

    At most 3 words in brand.
    """
    tag = c.find('img', class_='lazy-img')
    tmp = tag.get('alt')
    m = re.match(r'(\w+\s?){1,3}', tmp)
    brand = m.group(0).rstrip() if m else 'No Brand Found'
    return brand


def extract_product(c):
    title_container = c.find('a', class_='item-title')
    product_name = title_container.string
    return product_name.replace(',', '|').strip()


def extract_price(c):
    price_container = c.find('li', class_='price-current')
    tmp = price_container.string
    if not tmp:
        tmp = ''.join(price_container.stripped_strings)
    m = re.match(r'(.\d[\d.,]+)', tmp.strip())
    product_price = m.group(0) if m else "?"
    return product_price


def extract_from(page):
    """Extract data for each product.

    Return a list containing data for one product per list item.
    """
    containers = page.find_all('div', class_='item-container')
    data = []
    for container in containers:
        item = []
        item.append(extract_brand(container))
        item.append(extract_product(container))
        item.append(extract_price(container))
        data.append(item)
    return data


def write2csv(filename, data):
    with open(filename, 'w', newline='') as csvfile:
        fd = csv.writer(csvfile)
        headers = ["brand", "product_name", "product_price"]
        fd.writerow(headers)
        fd.writerows(data)


def get_html_from(url, parser='html.parser'):
    with uReq(url) as uClient:
        page_html = uClient.read()
    page_soup = BeautifulSoup(page_html, parser)
    return page_soup


my_url = ('https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/'
          'Category/ID-38?Tpk=graphic%20card')

page = get_html_from(my_url)
data = extract_from(page)
filename = "1234.csv"
write2csv(filename, data)
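As a quick sanity check (an assumption, not part of the original answer), the freshly written file can be read back with pandas; note that because write2csv delegates quoting to the csv module, commas inside product names no longer strictly need to be replaced with "|".

import pandas as pd

# read the CSV written by write2csv() back in and show the first rows
print(pd.read_csv("1234.csv").head())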

Extracting strong text and following p's

I have written code to extract a div (see below), but now I would like to show all the "strong" tags in one column and the text that follows them in a different column (for multiple files in a directory). I have uploaded an example to Dropbox: (https://www.dropbox.com/s/kbnal2pefih2ru4/test.html?dl=0).
My code so far is:
import textwrap
import os
from bs4 import BeautifulSoup

directory = 'C:/Research syntheses - Meta analysis/SeekingAlpha/Tests/'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, 'r') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', class_='content_part hid', id='article_qanda')
        print(filename, participants)
So my output would need to be: in column 1 all the strong tags and in column 2 the following p (sometimes more than one). I hope someone can help me!
You can loop through all the participants and save a temporary array with the columns of each row. Then you can display them as you wish. This is an example:
import textwrap
import os
from bs4 import BeautifulSoup

fname = "test.html"
with open(fname, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

participants = soup.find('div', class_='content_part hid', id='article_qanda')

n = -1
rows = []
for p in participants:
    name = p.find("strong")
    # find() on a plain NavigableString child returns -1, hence the "-1" check
    if name is not None and str(name) != "-1":
        n = n + 1
        rows.append([name.text])
    elif name is None:
        rows[n].append(p.text)

# now print all the rows
for r in rows:
    if len(r) > 1:
        # here you can display them as you wish.
        # r[0] contains the "strong" tag
        # r[1] contains the next "p" tag
        print("%s => %s" % (r[0], r[1]))
    else:
        # here you have only the "strong" tag
        print(r[0])
Edit:
I removed class_='content_part hid' from the soup.find, removed one loop, and added the multiprocessing part; see the multiprocessing documentation for more information:
import os
from bs4 import BeautifulSoup
import multiprocessing as mp

def process(filename):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', id='article_qanda')
        if not participants:
            return
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                print()
                print(name.text + " => ", end='')
            elif name is None:
                print(p.text, end=' ')

directory = '.'

if __name__ == '__main__':
    p = mp.Pool()
    p.map(process, os.listdir(directory))
Using the code from @rxw, I have edited his answer further into my final solution:
import textwrap
import os
from bs4 import BeautifulSoup
import pandas as pd

directory = 'C:/Research syntheses - Meta analysis/Transcripts'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', class_='content_part hid', id='article_qanda')
        if not participants:
            continue
        n = -1
        rows = []
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                n = n + 1
                rows.append([name.text])
            elif name is None:
                rows[n].append(p.text)
        # now print all the rows
        for r in rows:
            if len(r) > 1:
                # here you can display them as you wish.
                # r[0] contains the "strong" tag
                # r[1] contains the next "p" tag
                print("%s => %s" % (r[0], r[1]))
            else:
                # here you have only the "strong" tag
                print(r[0])
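Since the original goal was two columns (the strong text and the paragraphs that follow it) and pandas is already imported, one possible sketch for turning the collected rows into a table is shown below; the speaker/text column names and the participants.csv file name are assumptions, not part of the answer above.

import pandas as pd

# hypothetical example input, shaped like the rows collected in the loop above:
# first element is the strong text, the rest are the following paragraphs
rows = [["John Doe - CEO", "Thank you, operator.", "Good morning everyone."],
        ["Operator", "Our first question comes from ..."]]

table = pd.DataFrame(
    [(r[0], " ".join(r[1:])) for r in rows],
    columns=["speaker", "text"],               # assumed column names
)
table.to_csv("participants.csv", index=False)  # assumed output file name
print(table)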

scrape multiple page and put result in one CSV

I am a newbie in programming, and I am starting with Python. I use it for scraping data from websites, online shops to be specific. I want to scrape every page of the result listing (with pagination) and put the resulting URLs in one CSV.
This is what I've been trying:
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for number2 in range(10):
        filename = "tokopedia" + str(number2) + ".csv"
        f = open(filename, "w")
        headers = "Link" + "\n"
        f.write(headers)
        for product in product_links:
            barang = product.a["ng-href"]
            print(barang + "\n")
            f.write(barang + "\n")
        f.close()

driver.close()
The result that I got in the CSV is only for one page. Can you guys help me?
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

# open the single output file once, before looping over the pages,
# and write the header only once
filename = "tokopedia.csv"
f = open(filename, "w")
headers = "Link" + "\n"
f.write(headers)

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for product in product_links:
        barang = product.a["ng-href"]
        print(barang + "\n")
        f.write(barang + "\n")

f.close()
driver.close()
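A slightly more defensive variant of the same idea, sketched with a with block and the csv module (the selectors, the driver path, and the 10-page range are taken from the question; everything else is an assumption), writes the header exactly once and closes the file automatically.

import csv
from bs4 import BeautifulSoup
from selenium import webdriver

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
driver = webdriver.Chrome('/home/yoga/Downloads/chromedriver')

with open("tokopedia.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Link"])                        # header written once
    for number in range(10):                         # pages 0..9, as in the question
        driver.get(myurl + str(number))
        page = BeautifulSoup(driver.page_source, "html.parser")
        for product in page.find_all("div", {"class": "product-summary"}):
            writer.writerow([product.a["ng-href"]])  # one product URL per row

driver.close()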
