I would like the script to scrape all items from each page and append them to a CSV file, but there are two problems:
1) When I run the script it only scrapes a single page (the last page, 64). It doesn't crawl from page 1 through 64.
2) When the script writes data to the CSV file it doesn't append new lines; it re-writes the whole CSV file.
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')

with open('result.csv', 'w') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num
    driver.get(url)

buyers = driver.find_elements_by_xpath("//div[@class='product-title']")
prices = driver.find_elements_by_xpath("//span[@class='new-price-text']")
discount = driver.find_elements_by_xpath("//div[@class='discount']")
oldprice = driver.find_elements_by_xpath("//span[@class='old-price-text']")

num_page_items = len(buyers)
with open('result.csv', 'a') as f:
    for c in range(num_page_items):
        f.write(buyers[c].text + ' , ' + prices[c].text + ' , ' + discount[c].text + ' , ' + oldprice[c].text + '\n')

driver.close()
The main issue you had is an indentation problem that left the scraping and writing outside the loop, so your script only used the last page it loaded.
Another issue I saw is that you were just collecting all the titles together, all the old prices together, and so on.
For this reason it would be difficult to tell which price belongs to which item, for example when an item has missing data.
To solve this I've put all the items of a single webpage into the variable "products".
About the "append" or "write" option of the CSV: in my implementation the first thing I check is whether the result.csv file exists (a condensed sketch of this check follows this note, before the full code).
Then there are two cases:
result.csv doesn't exist: I create it and write the header
result.csv already exists: the header is already in place and I can simply append new rows while looping
To extract the data easily I've used BeautifulSoup (install it easily with pip).
There are still several challenges ahead, because the data on this webpage is not consistent, but the following example should be enough to get you going.
Please keep in mind that the "break" in the code will stop the scraping at the 1st page.
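In isolation, and as my own condensed restatement of the logic above rather than code taken from the answer, the file-exists check boils down to this:

import os.path

if not os.path.isfile('result.csv'):
    # first run only: create the file and write the header
    with open('result.csv', 'w') as f:
        f.write("Product Name, Sale Price, Discount, Old Price \n")
# on every later run the file already exists, so rows are simply appended with mode 'a'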
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver
from bs4 import BeautifulSoup
import os.path

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')
#driver = webdriver.Chrome()


def write_csv_header():
    with open('result.csv', 'w') as f:
        f.write("Product Name, Sale Price, Discount, Old Price \n")


def write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link):
    with open('result.csv', 'a') as f:
        f.write(product_title + ' , ' + product_new_price + ' , ' + product_discount + ' , ' + product_old_price + ' , ' + product_link + '\n')


if not os.path.isfile('result.csv'):  # write the header only when the file doesn't exist yet
    write_csv_header()

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num
    driver.get(url)
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    products = soup.findAll("a", {"class": "single-product"})
    for product in products:
        try:
            product_title = product.find("div", {"class": "product-title"}).text.strip()
        except:
            product_title = "Not available"
        try:
            product_new_price = product.find("span", {"class": "new-price-text"}).text.strip()
        except:
            product_new_price = "Not available"
        try:
            product_old_price = product.find("span", {"class": "old-price-text"}).text.strip()
        except:
            product_old_price = "Not available"
        try:
            product_discount = product.find("div", {"class": "discount"}).text.strip()
        except:
            product_discount = "Not available"
        try:
            product_link = product['href']
        except:
            product_link = "Not available"
        write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link)
    break  # this stops the parsing at the 1st page. I think it is a good idea to check data and fix all discrepancies before proceeding

driver.close()
If you want to append new lines to the file, you must use the "a" argument instead of "w".
with open('result.csv', 'a') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")
definition of "w" option:
Opens a file for writing only. Overwrites the file if the file exists.
If the file does not exist, creates a new file for writing.
definition of "a" option:
Opens a file for appending. The file pointer is at the end of the file
if the file exists. That is, the file is in the append mode. If the
file does not exist, it creates a new file for writing.
definition of "ab" option:
Opens a file for appending in binary format. The file pointer is at
the end of the file if the file exists. That is, the file is in the
append mode. If the file does not exist, it creates a new file for
writing.
Therefore, for appending new lines, you must use an option that contains "a" (the append option).
The definitions above are taken from this answer.
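To make the difference concrete, here is a small standalone demonstration (my addition, writing to a hypothetical demo.csv):

with open('demo.csv', 'w') as f:         # 'w' creates the file, or truncates it if it already exists
    f.write("Product Name, Sale Price \n")

with open('demo.csv', 'a') as f:         # 'a' keeps the existing content and writes at the end
    f.write("Batik Shirt , 150000 \n")   # the file now holds the header plus this row

with open('demo.csv', 'w') as f:         # opening with 'w' again wipes both lines above
    f.write("only this line survives \n")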
Related
I want to do a search using keywords from a file, in a loop, using Selenium and BeautifulSoup.
Read the 1st row, put its value (one keyword) into the search query area, and search; when done, use the 2nd row from the file, and so on.
The read-file part prints all keywords, one per row, but I am not sure how to put them into the search query area one at a time.
def SearchFuncs():
    driver.get('https://www.website.com/search/?q=pet%20care') # put the value from one row on search/?q=
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #print(a['title'])
        return a

#SearchFuncs()
x = SearchFuncs()
print(x['title'])
# read file section:
with open("kw-to-search.txt", "r") as f:
    for line in f:
        print(line.strip())
Updated: I also added saving the result to a file, but I tested the code without the save-to-file section.
This is the code I tried using one of the solutions provided (thank you, broderick). I don't get any output, and no error either:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time

def SearchFuncs(addr):
    driver.get(addr)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #return a
        #print(a ['title'])

with open("kw.txt", "r") as f:
    for line in f:
        addr_to_search = 'https://www.website.com/search/?q='
        # Build search query from lines
        pieces = line.split()
        query = ''
        for i in range(len(pieces) - 1):
            query += (pieces[i] + '%20')
        query += pieces[-1]
        # Debugging print
        print(query)
        addr_to_search += query
        SearchFuncs(addr_to_search)

textList = a['title']

outF = open("keyword_result.txt", 'a')
for line in textList:
    # write line to output file
    outF.write(line)
    #outF.write("\n")
outF.write(textList + '\n')
outF.close()
Updated with another code:
This is another variation Arthur Pereira provided (thank you, Arthur Pereira):
def SearchFuncs(url):
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        return a

    #y = SearchFuncs(url)
    #print(y ['title'])
    #print(a['title'])
    textList = a['title']

    outF = open("Keyword_results-2.txt", 'a')
    for line in textList:
        # write line to output file
        outF.write(line)
        #outF.write("\n")
    outF.write(textList + '\n')
    outF.close()

with open("kw.txt", "r") as f:
    for line in f:
        query = line.strip().replace(" ", "%20")
        url = "https://www.website.com/search/?q=" + query
        SearchFuncs(url)
Error:
Traceback (most recent call last):
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 45, in <module>
SearchFuncs(url)
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 31, in SearchFuncs
textList = a['title']
TypeError: list indices must be integers or slices, not str
Iterate over each line in your text file and prepare it for the search, then pass the resulting url to your search function as a parameter (the full example is at the end of this answer).
Also, I think you misunderstand the concept of return. Here your code just returns the first a element and then leaves the function; nothing after the return will run:
for a in soup.select('div.Eqh.F6l.Jea.k1A.zI7.iyn.Hsu a'):
    return a
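For contrast, here is a minimal sketch (my addition, keeping the question's placeholder selector and assuming driver is already set up elsewhere) that collects every match instead of returning on the first iteration:

from bs4 import BeautifulSoup

def SearchFuncs(url):
    driver.get(url)  # assumes a Selenium driver created earlier, as in the question
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = []
    for a in soup.select('div.class_name a'):  # 'div.class_name' is a placeholder selector
        results.append(a['title'])             # gather each title instead of leaving the function
    return results                             # return runs once, after the loop has finished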
The error you are getting is because your select isn't finding anything, so a is still an empty list and you are trying to index that list with a string:
textList = a['title']
So, assuming you want to get the text inside each anchor element, you have to find the correct div and jump into the a element. Then you can get the title and write it to a file.
def SearchFuncs(url):
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    collection = soup.select('div.Collection-Item a')
    for item in collection:
        title = item['title'].strip()
        with open("Keyword_results-2.txt", 'a', encoding="utf-8") as outF:
            outF.write(title + '\n')  # write line to output file

with open("kw.txt", "r") as f:
    for line in f:
        query = line.strip().replace(" ", "%20")
        url = "https://www.pinterest.com/search/pins/?q=" + query
        SearchFuncs(url)
Try
def SearchFuncs(addr):
    driver.get(addr)
    ...
and
with open ("kw-to-search.txt", "r") as f:
for line in f:
addr_to_search = 'https://www.website.com/search/?q='
# Build search query from lines
pieces = line.split()
query = ''
for i in range(len(pieces) - 1):
query += (pieces[i] + '%20')
query += pieces[-1]
# Debugging print
print(query)
addr_to_search += query
SearchFuncs(addr_to_search)
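As a side note (my addition, not part of the original suggestion), the standard library can do the percent-encoding for you, which avoids building the %20 separators by hand:

from urllib.parse import quote

with open("kw-to-search.txt", "r") as f:
    for line in f:
        query = quote(line.strip())   # encodes spaces and other unsafe characters
        addr_to_search = 'https://www.website.com/search/?q=' + query
        print(query)                  # debugging print
        SearchFuncs(addr_to_search)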
I am trying to scrape a website to get the titles and prices, but once the data is extracted and saved to the CSV file the price column formatting gets disturbed and is not displayed properly: e.g. $8,900 becomes $8 in one column and 900 is shifted to the next column.
from selenium import webdriver
import time

max_pages = 1
driver = webdriver.Chrome()

with open('autotrader.csv', 'w') as f:
    f.write("Title,Price \n")

for i in range(1, max_pages + 1):
    url = "https://www.autotrader.co.uk/car-search?advertClassification=standard&postcode=WC2N%205DU&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&advertising-location=at_cars&is-quick-search=TRUE&include-delivery-option=on&page=" + str(max_pages)
    driver.get(url)
    title = driver.find_elements_by_xpath('//h3[@class="product-card-details__title"]')
    price = driver.find_elements_by_xpath('//div[@class="product-card-pricing__price"]')
    page_items = len(title)
    with open('autotrader.csv', 'a') as f:
        for i in range(page_items):
            f.write(title[i].text + "," + price[i].text + "\n")

driver.close()
Use csv.writer and it will properly quote fields with delimiter characters in them:
import csv

# ... code to fetch titles and prices ...

with open('autotrader.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Price'])
    for t, p in zip(title, price):
        w.writerow([t.text, p.text])
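To see why this fixes the split column, here is a small standalone demonstration (my addition, writing a hypothetical row to demo.csv):

import csv

with open('demo.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Price'])
    w.writerow(['Ford Fiesta', '$8,900'])  # the comma inside the price gets quoted automatically

# demo.csv now contains:
#   Title,Price
#   Ford Fiesta,"$8,900"
# so a spreadsheet keeps $8,900 in a single Price column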
I'm working on a web parser for a webpage containing mathematical constants. I need to replace some characters in order to get them into a specific format, but I don't know why: when I print the result it seems fine, yet when I open the output file the formatting done by replace() doesn't seem to have taken effect.
This is the code:
#!/usr/bin/env python3
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "http://www.ebyte.it/library/educards/constants/ConstantsOfPhysicsAndMath.html"
soup = BeautifulSoup(urlopen(url).read(), "html5lib")
f = open("ebyteParse-output.txt", "w")

table = soup.find("table", attrs={"class": "grid9"})
rows = table.findAll("tr")
for tr in rows:
    # If it's a category of constants we write that as a comment
    if tr.has_attr("bgcolor"):
        f.write("\n\n# " + tr.find(text=True) + "\n")
        continue

    cols = tr.findAll("td")
    if (len(cols) >= 2):
        if (cols[0]["class"][0] == "box" or cols[0]["class"][0] == "boxi" and cols[1]["class"][0] == "boxa"):
            constant = str(cols[0].find(text=True)).replace(" ", "-")
            value = str(cols[1].find(text=True))
            value = value.replace(" ", "").replace("...", "").replace("[", "").replace("]", "")

            print(constant + "\t" + value)
            f.write(constant + "\t" + value)
            f.write("\n")

f.close()
This is what print shows:
This is what I get in the output file:
Thank you,
Salva
The file I was looking at was cached, so no changes were visible. Thanks for answering.
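For completeness (my addition, not part of the original exchange), writing through a context manager guarantees the file is flushed and closed before you inspect it, which rules out one source of stale contents:

# minimal sketch with hypothetical rows; the real script builds them from the parsed table
with open("ebyteParse-output.txt", "w") as f:
    for constant, value in [("pi", "3.14159"), ("Euler-number", "2.71828")]:
        f.write(constant + "\t" + value + "\n")
# here the file is guaranteed to be flushed and closed,
# so re-opening it in an editor shows the latest contents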
So I have this Python script. Right now I run the script and it gives me an output file in CSV.
What I want: when it finishes, restart it and check for changes to those output values (without refreshing the output file on restart and erasing all the previously collected data).
Also, it takes about 3 seconds per line of data to be retrieved. Does anyone know how I can speed this up to handle large data sets?
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        #pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))

for i in inputs:
    extract(i[0])

ifile.close()
print("finished")
I'm trying to create a table scraper; I've written this code: http://pastebin.com/t1wSPvbb
But I have a problem with saving the data to a text file: after 10 pages the scraper saves the same data.
This is the code with the problem:
for num in range(1, 500):
    print num
    try:
        resp = ''
        resp = opener.open("http://login.site.com/view.asp?view&PAGE=" + str(num))
        soup = BeautifulSoup(resp.read())
        for tr in soup.find_all('tr')[3:]:
            tds = tr.find_all('td')
            outfile.write(tds[2].text.encode('utf-8', 'replace') + ',' + tds[0].text.encode('utf-8', 'replace') + ',' + tds[1].text.encode('utf-8', 'replace') + '\n')
        pass
    except:
        pass
After 10 pages, the scraper saves the same data for the next few pages.