Can't create a folder from the command line with my .py file? - python

I've just written a BeautifulSoup script for AliExpress. Even though the script runs properly from the command line, I couldn't create a folder.
Can you please help me?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.aliexpress.com/category/200003482/dresses.html?spm=2114.11010108.101.3.650c649b3Cw8J9&g=y'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, 'html.parser')

# grabs each product
containers = page_soup.findAll('div', {'class': 'item'})

filename = 'product.csv'
f = open(filename, 'w')
headers = 'product_name, item_price, store_name\n'
f.write(headers)

for container in containers:
    product_name = container.h3.a.text
    product_container = container.findAll('span', {'class': 'price price-m'})
    price_container = product_container[0].findAll('span', {'class': 'value'})
    item_price = price_container[0].text
    store_container = container.findAll('div', {'class': 'info-more'})
    name_container = store_container[0].findAll('div', {'class': 'store-name util-clearfix'})
    store_name = name_container[0].a.text

    print('product_name: ' + product_name)
    print('item_price: ' + item_price)
    print('store_name: ' + store_name)

    f.write(product_name + ',' + item_price + ',' + store_name + '\n')
f.close()
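Note that nothing in the script above actually creates a folder: open(filename, 'w') only creates a file in whatever directory the command was run from, which may not be where you are looking. A minimal sketch that creates a dedicated output folder and writes the CSV inside it; the folder name aliexpress_data is an assumption for illustration, not from the original:

import os

out_dir = 'aliexpress_data'          # hypothetical folder name, pick your own
os.makedirs(out_dir, exist_ok=True)  # create the folder if it does not exist yet

# join the folder and file name so the CSV always lands inside the folder
filename = os.path.join(out_dir, 'product.csv')
with open(filename, 'w') as f:
    f.write('product_name, item_price, store_name\n')
    # ... write one row per product here, exactly as in the loop above ...
print('wrote', os.path.abspath(filename))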

Related

Unable to add text file to path in python

I was trying to scrape some product data from a site using BeautifulSoup in Python and then store it in text files in separate folders. In the code below I am stuck near the end; the rest is included just for reference.
import unittest, time, random
import urllib.request
import os
from selenium import webdriver
from selenium.common.exceptions import InvalidArgumentException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd

links = []
soup_list = []
imgs = []
website = "https://www.energystar.gov/productfinder"
rate = [i/10 for i in range(10)]
cnt = 0
quote = '"'
newline = '\n'
colon = ' : '

browser = webdriver.Firefox(executable_path="C:\\Users\\abc\\.wdm\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
url2 = 'https://www.energystar.gov/productfinder/product/certified-room-air-cleaners/results?page_number='

def getdata(url):
    browser.get(url)
    content = browser.page_source
    soup1 = BeautifulSoup(content, "html.parser")
    return soup1

#pagenos = ['0','13']
pagenos = []
for i in range(0,2):
    pagenos.append(i)
    i =+ 1
print(pagenos)

for i in range(0,len(pagenos)):
    url = url2 + str(pagenos[i])
    soup1 = getdata(url)
    soup_list.append(soup1)
    for main in soup1.findAll('div', attrs = {'class' : 'row certified-room-air-cleaners'}):
        name = main.find('a', href=True)
        if (name != ''):
            links.append((name.get('href')).strip())

print("Got links : ", len(links))
print("Got soups : ", len(soup_list))
#print('Soup 1:', soup_list[1])

for link in links:
    #just for testing 10 links
    cnt = cnt + 1
    if cnt >= 20:
        break
    # time delay before we access the next page..
    time.sleep(random.choice(rate))
    #print("Fetching link..... : ", link)
    link = link[5:]
    #print("Fetching link..... : ", link)
    link = website + link
    browser.get(link)
    linkcontent = browser.page_source
    soup2 = BeautifulSoup(linkcontent, "html.parser")
    pmname = soup2.find('div', attrs={'class' : 'l-wrapper'}).find('h1')
    if not pmname:
        print("Error no product name for link : ", link)
        continue
    pmname = pmname.text.strip().split(' - ')
    bname = pmname[0].strip()
    mname = pmname[1].strip()
    #print(bname)
    #print(mname)
    # Creating folder with Brand Name as name
    try:
        os.makedirs(str(bname))
    except FileExistsError:
        # directory already exists
        pass
    # Creating text files with model number as name
    fname = mname + '.txt'
    path = '/' + str(bname)
    a = os.path.join(path, fname)
    print(a)
    ff = open(fname, mode='w')
    ff.write("BRAND NAME : " + quote + bname + quote + newline)
    ff.write("MODEL : " + quote + mname + quote + newline)

browser.close()
exit()
Here I can successfully create the text file, but I can't seem to place it at the created path.
You are missing something:
If you are writing a list to a file, the file must be opened in write mode; if you instead want to append to existing data, the file must be opened in "a" mode so new data is added at the end. Either way, the file must be opened before anything is written to it and closed at the end, to finish the file process.
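Beyond the file mode, the reason the file never appears inside the folder seems to be that ff = open(fname, mode='w') opens the bare file name in the current working directory and ignores the joined path a, and path = '/' + str(bname) points at the filesystem root rather than at the folder that was just created. A minimal sketch of the folder-plus-file pattern, with stand-in values for bname and mname (in the question they come from the scraped page):

import os

bname = 'ExampleBrand'   # stand-in brand name
mname = 'Model-123'      # stand-in model name

os.makedirs(bname, exist_ok=True)            # create the brand folder if needed
fpath = os.path.join(bname, mname + '.txt')  # relative path inside that folder

with open(fpath, mode='w') as ff:            # open the joined path, not the bare name
    ff.write('BRAND NAME : "' + bname + '"\n')
    ff.write('MODEL : "' + mname + '"\n')
print('wrote', os.path.abspath(fpath))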

Get Title from img delivers NoneType object

I tried to create a web crawler, but unfortunately an error occurred during execution. The web crawler should read data from the website; I get the title from the picture.
brand = make_rating_sp[0].img["title"].title()
TypeError: 'NoneType' object is not subscriptable
Why does this error occur and how can I fix it?
I would be very happy to receive an answer.
# pip install beautifulsoup4
# (urlopen comes from the standard library's urllib.request, so it needs no install)
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

containers = page_soup.findAll("div", {"class": "item-container"})

out_filename = "graphics_cards.csv"
headers = "brand,product_name,shipping \n"
f = open(out_filename, "w")
f.write(headers)

for container in containers:
    divWithInfo = containers[1].find("div","item-info")
    make_rating_sp = container.div.select("a")
    brand = make_rating_sp[2].img["title"].title()
    product_name = container.div.select("a")[2].text
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
    print("brand: " + brand + "\n")
    print("product_name: " + product_name + "\n")
    print("shipping: " + shipping + "\n")
    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")

f.close()  # Close the file
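The traceback means that make_rating_sp[0].img evaluated to None: the selected anchor has no <img> child, so subscripting it with ["title"] raises the TypeError. A minimal defensive sketch of the guard, run against a tiny inline sample rather than the live page:

from bs4 import BeautifulSoup

# tiny inline sample: the second anchor has no <img> child
html = '''<div class="item-container"><div>
<a href="#"><img title="ExampleBrand"/></a>
<a href="#">no image here</a>
</div></div>'''
container = BeautifulSoup(html, "html.parser").div

for a in container.div.select("a"):
    img = a.img  # None when the <a> has no <img> child
    if img is None or img.get("title") is None:
        continue  # skip anchors without a titled image instead of crashing
    print("brand: " + img["title"].title())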

Scraping does not export to Excel

I am trying for the first time to scrape information from a website and export it to an Excel file. However, not all of the information is scraped, nor is the file created for the export.
This is what I get in anaconda:
(base) C:\Windows\system32>firstwebscrape.py
brand: []
product_name: ASRock Radeon RX 5700 XT DirectX 12 RX 5700 XT TAICHI X 8G OC+ Video Card
product_price: €446,99 
Here is the code
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphic%20card'

#opening up the connection grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#HTML parser
page_soup = soup(page_html, "html.parser")

#grabs all containers
containers = page_soup.findAll("div",{"class":"item-container"})

filename = "123.csv"
f = open(filename, "w")
headers = "brand, product_name, product_price\n"
f.write(headers)

for container in containers:
    brand = container.findAll("a",{"class":"title"})
    title_container = container.findAll("a",{"class":"item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li",{"class":"price-current"})
    product_price = price_container[0].text.strip()

print("brand: ", brand)
print("product_name: " + product_name)
print("product_price: " + product_price)
f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")
f.close()
Your code runs fine. Just correct this in your loop:
for container in containers:
    brand = container.findAll("a",{"class":"title"})
    title_container = container.findAll("a",{"class":"item-title"})
    product_name = title_container[0].text
    price_container = container.findAll("li",{"class":"price-current"})
    product_price = price_container[0].text.strip()
    # these code lines have to be in your for loop!
    print("brand: ", brand)
    print("product_name: " + product_name)
    print("product_price: " + product_price)
    f.write(str(brand) + "," + product_name.replace(",", "|") + "," + product_price + "\n")
You want to print and save for every item in your iteration over containers. Otherwise only the last item gets saved to your CSV.
Here is a solution that splits the job between getting the data, extracting it, and writing the result. It also leaves the job of writing the CSV data to the csv module.
import csv
import re
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup

def extract_brand(c):
    """Locate brand

    At most 3 words in brand
    """
    tag = c.find('img', class_='lazy-img')
    tmp = tag.get('alt')
    m = re.match(r'(\w+\s?){1,3}', tmp)
    brand = m.group(0).rstrip() if m else 'No Brand Found'
    return brand

def extract_product(c):
    title_container = c.find('a', class_='item-title')
    product_name = title_container.string
    return product_name.replace(',', '|').strip()

def extract_price(c):
    price_container = c.find('li', class_='price-current')
    tmp = price_container.string
    if not tmp:
        tmp = ''.join(price_container.stripped_strings)
    m = re.match(r'(.\d[\d.,]+)', tmp.strip())
    product_price = m.group(0) if m else "?"
    return product_price

def extract_from(page):
    """Extract data for each product

    Return a list containing data for one product per list item.
    """
    containers = page.find_all('div', class_='item-container')
    data = []
    for container in containers:
        item = []
        item.append(extract_brand(container))
        item.append(extract_product(container))
        item.append(extract_price(container))
        data.append(item)
    return data

def write2csv(filename, data):
    with open(filename, 'w', newline='') as csvfile:
        fd = csv.writer(csvfile)
        headers = ["brand", "product_name", "product_price"]
        fd.writerow(headers)
        fd.writerows(data)

def get_html_from(url, parser='html.parser'):
    with uReq(url) as uClient:
        page_html = uClient.read()
        page_soup = BeautifulSoup(page_html, parser)
    return page_soup

my_url = ('https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/'
          'Category/ID-38?Tpk=graphic%20card')
page = get_html_from(my_url)
data = extract_from(page)
filename = "1234.csv"
write2csv(filename, data)

Scrape multiple pages and put the result in one CSV

I am a newbie at programming, and I am starting with Python. I use it for scraping data from websites, online shops to be specific. I want to scrape every page of the results (with pagination) and put the resulting URLs in one CSV.
This is what I've been trying:
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div",{"class":"product-summary"})
    for number2 in range(10):
        filename = "tokopedia" + str(number2) + ".csv"
        f = open(filename, "w")
        headers = "Link" + "\n"
        f.write(headers)
        for product in product_links:
            barang = product.a["ng-href"]
            print(barang + "\n")
            f.write(barang + "\n")
        f.close()

driver.close()
The result that I got in the CSV is only for one page. Can you guys help me?
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

# open the output file once, before the page loop, so all pages go into one CSV
filename = "tokopedia.csv"
f = open(filename, "w")
headers = "Link" + "\n"
f.write(headers)  # write the header line only once

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div",{"class":"product-summary"})
    for product in product_links:
        barang = product.a["ng-href"]
        print(barang + "\n")
        f.write(barang + "\n")

f.close()
driver.close()

Exclude unwanted multi-line text that came up when parsing HTML with BeautifulSoup

I am using the following code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956,2016))
for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(html_input, "html.parser")
        container = page_soup.findAll("li")
        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)
        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
I get text that does not seem to be in the "li" container, but it gets written to the output. This is the unwanted text:
<!--
google_ad_client = "ca-pub-9635531430093553";
/* in medias res */
google_ad_slot = "9880694813";
google_ad_width = 468;
google_ad_height = 60;
//-->
How can I get rid of it?
The text you don't want is coming from a script element. So get rid of the script elements before you start and it works:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956,2016))
for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(html_input, "html.parser")
        [s.extract() for s in page_soup('script')]
        container = page_soup.findAll("li")
        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)
        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
All I did was add the one line:
[s.extract() for s in page_soup('script')]
which finds the script elements and removes them.
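If the throwaway list comprehension reads oddly, a plain loop with BeautifulSoup's decompose() does the same job; decompose() removes the tag and destroys it, whereas extract() removes it but hands it back. A minimal sketch on an inline snippet:

from bs4 import BeautifulSoup

html = "<ul><script>var ad = 1;</script><li>song title</li></ul>"
page_soup = BeautifulSoup(html, "html.parser")

# remove every <script> element in place before extracting text
for s in page_soup("script"):
    s.decompose()

print(page_soup.li.text)  # prints: song title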
