I have a Python script that, when run, produces an output file in CSV format.
What I want: when the script finishes, it should restart and check for changes to those output values, without recreating the output file on restart and erasing all the previously collected data.
Also, it takes about 3 seconds to retrieve each line of data. Does anyone know how I can make it faster so that it can handle large data sets?
import urllib2,re,urllib,urlparse,csv,sys,time,threading,codecs
from bs4 import BeautifulSoup
def extract(url):
    """Scrape one product page and append its price/stock row to ``ifile``.

    Fetches ``url``, reads the price from the ``js-price-display`` div that
    follows the ``js-product-price`` div, and treats a missing ``price-oos``
    paragraph as 'In Stock'.  The row (URL, price, stock text, timestamp) is
    written to the module-level ``ifile`` handle.

    Progress markers go to stdout: '0' before the request is made and '1'
    once the row has been written.  Any failure is printed and swallowed so
    that one bad URL does not stop the rest of the run.
    """
    try:
        sys.stdout.write('0')
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        stock = 'In Stock' if oos is None else oos.getText()
        # csv.writer quotes fields that contain commas, so a price such as
        # "1,299.00" no longer spills into extra columns the way plain string
        # concatenation did.  The '\n' terminator keeps the original line
        # ending.
        row_writer = csv.writer(ifile, lineterminator='\n')
        row_writer.writerow([url, price, stock, time.ctime()])
        sys.stdout.write('1')
    except Exception as e:
        # Best-effort per URL: report the problem and let the caller continue.
        print(e)
# Driver: read URLs from the first column of input.csv and append one scraped
# row per URL to output.csv.
import os  # used only for the header check below

# Write the header only when the output file is new or empty.  The file is
# opened in append mode so rows from earlier runs survive a restart; without
# this check every restart would add another header line among the data.
needs_header = (not os.path.exists('output.csv')
                or os.path.getsize('output.csv') == 0)

# Third argument 0 requests an unbuffered file (Python 2 semantics), so each
# row reaches disk as soon as extract() writes it.
ifile = open('output.csv', "a", 0)
try:
    if needs_header:
        ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    # For a UTF-16 input file, open it with
    # codecs.open('input.csv', 'rU', 'utf-16') instead.
    with open('input.csv') as source:
        for row in csv.reader(source):
            if not row:
                # Blank line in the input file: there is no URL to fetch.
                continue
            extract(row[0])
finally:
    # Close the output even if reading the input fails part-way through.
    ifile.close()
print("finished")
Related
I'm very new to Python and currently stumped. The first several rows work correctly: they print and are written to the file just fine. But once the script gets to row 11, it throws a "list index out of range" error, and it does the same for all the remaining rows.
I can't figure out for the life of me how, once it fails at row 11, to move on to row 12 and run the try block again (it should be successful on the remaining rows).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# Scrape the coin table on the CoinMarketCap front page into coins.csv.
my_url = "https://coinmarketcap.com/"
uClient = uReq(my_url)
try:
    page_html = uClient.read()
finally:
    # Release the connection even if the read fails.
    uClient.close()

page_soup = soup(page_html, "html.parser")
rows = page_soup.find("tbody").findAll("tr")

filename = "coins.csv"
headers = "Rank, Name, Price, Circulating Supply\n"

# Positions of the <td> cells copied into the CSV.
RANK_COL, NAME_COL, PRICE_COL, SUPPLY_COL = 1, 2, 3, 8

with open(filename, "w") as f:
    f.write(headers)
    for row_number, row in enumerate(rows, start=1):
        # Look the cells up once per row instead of once per column.
        cells = row.findAll('td')
        if len(cells) <= SUPPLY_COL:
            # This row lacks some of the columns read below; indexing it
            # would raise "list index out of range".  Report it and carry
            # on with the next row.
            print("row %d skipped: only %d cells" % (row_number, len(cells)))
            continue
        rank_number = cells[RANK_COL].text
        coin_name = cells[NAME_COL].text
        # Commas are stripped from the numeric fields so they do not act as
        # column separators in the output.
        coin_price = cells[PRICE_COL].text.replace(",", "")
        supply = cells[SUPPLY_COL].text.replace(",", "")
        line = rank_number + "," + coin_name + "," + coin_price + "," + supply + "\n"
        print(line)
        f.write(line)
Any help would be greatly appreciated!
Thanks!
I have written a script that loops and prints the results. I am trying to add saving to CSV, but I can't figure out how to do it.
I have code for saving that works in my other scripts, but here it either writes only one line or writes 3 lines (one for each loop).
How do I make it write all the results?
This is the code I am working with
from selenium import webdriver
import time
# Question's script: opens the TripAdvisor London restaurant listing in Firefox,
# prints title / URL / rating for every element with class 'listing', then
# clicks the '.next' element, repeating until something raises.
# NOTE(review): the indentation of this snippet was lost, so the loop nesting
# below is inferred from statement order only.
browser = webdriver.Firefox(executable_path="/Users/**/Downloads/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')
meci = browser.find_elements_by_class_name('listing')
filename ="scrape1.1.csv"
f = open(filename, 'w')
# NOTE(review): 'headers' is assigned but never used; the write below emits an
# empty string, so no header line reaches the file.
headers ="Title, URL, Rating\n "
f.write("")
while True:
try:
meci = browser.find_elements_by_class_name('listing')
for items in meci:
title_cont = items.find_element_by_class_name('property_title')
title = title_cont.text
href = title_cont.get_attribute('href')
rating = items.find_element_by_class_name('ui_bubble_rating')
# Keep only the leading part of the alt text by dropping ' of 5 bubbles'.
ratingbubble = rating.get_attribute('alt').replace(' of 5 bubbles', '')
print(title)
print(href)
print(ratingbubble)
# Pause, click the '.next' element, then pause again.
time.sleep(3)
browser.find_element_by_css_selector('.next').click()
time.sleep(3)
# Bare except: any exception raised in the try body ends the paging loop.
except:
break
# NOTE(review): this is the only write of scraped data, and it follows both the
# inner 'for' body and the 'except'. It therefore presumably sees only the
# values left over from the last listing processed, which would match the
# reported symptom of too few rows in the CSV. Confirm against the original
# indentation.
f.write(title + "," + href + "," + ratingbubble + "\n")
f.close()
browser.quit()
try this
from selenium import webdriver
import time
# Page through the TripAdvisor London restaurant listings and write the title,
# URL and rating of every listing to scrape1.1.csv as soon as it is read.
import csv

from selenium.common.exceptions import NoSuchElementException, WebDriverException

browser = webdriver.Firefox(executable_path="C:/Py/pythonv4/gecko/geckodriver")
try:
    # The URL has to be one string literal; a quoted string cannot continue
    # onto the next source line.
    browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

    filename = "scrape1.1.csv"
    headers = "Title, URL, Rating\n"
    with open(filename, 'w') as f:
        # The header is written once, before any data rows.
        f.write(headers)
        # csv.writer quotes values containing commas so they stay in a
        # single column.
        writer = csv.writer(f, lineterminator="\n")
        while True:
            meci = browser.find_elements_by_class_name('listing')
            for items in meci:
                try:
                    title_cont = items.find_element_by_class_name('property_title')
                    rating = items.find_element_by_class_name('ui_bubble_rating')
                except NoSuchElementException:
                    # A listing without a title link or rating bubble is
                    # skipped instead of ending the whole scrape.
                    continue
                title = title_cont.text
                href = title_cont.get_attribute('href')
                # get_attribute returns None when the attribute is absent.
                ratingbubble = (rating.get_attribute('alt') or '').replace(' of 5 bubbles', '')
                print(title)
                print(href)
                print(ratingbubble)
                # Write inside the loop so every listing gets its own row.
                writer.writerow([title, href, ratingbubble])
            time.sleep(5)
            try:
                browser.find_element_by_css_selector('.next').click()
            except WebDriverException:
                # No clickable '.next' element: treat this as the last page.
                break
            time.sleep(1)
finally:
    # Always shut the browser down, even if scraping failed.
    browser.quit()
I'm working on a web parser for a webpage containing mathematical constants. I need to replace some characters to get the data into a specific format. When I print the result it seems to work fine, but when I open the output file, the formatting done by replace() doesn't seem to have taken effect.
That's the code
#!/usr/bin/env python3
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Parse the table of physical and mathematical constants on ebyte.it and
# write one "name<TAB>value" line per constant to ebyteParse-output.txt.
url = "http://www.ebyte.it/library/educards/constants/ConstantsOfPhysicsAndMath.html"
soup = BeautifulSoup(urlopen(url).read(), "html5lib")
table = soup.find("table", attrs={"class": "grid9"})
rows = table.findAll("tr")

# An explicit encoding keeps the output independent of the platform default,
# and the with-block closes the file even if a row fails to parse.
with open("ebyteParse-output.txt", "w", encoding="utf-8") as f:
    for tr in rows:
        # If it is a category of constants, we write that as a comment.
        if tr.has_attr("bgcolor"):
            heading = tr.find(text=True)
            # find() returns None for a row with no text at all.
            f.write("\n\n# " + (heading or "") + "\n")
            continue

        cols = tr.findAll("td")
        if len(cols) < 2:
            continue

        # First CSS class of each cell; "" when the cell has no class.
        name_class = (cols[0].get("class") or [""])[0]
        value_class = (cols[1].get("class") or [""])[0]

        # 'and' binds tighter than 'or', so this accepts every "box" cell and
        # only those "boxi" cells whose neighbour is "boxa".  The grouping is
        # spelled out to make the existing behaviour visible.
        # NOTE(review): (box or boxi) and boxa may have been intended; confirm.
        if name_class == "box" or (name_class == "boxi" and value_class == "boxa"):
            constant = str(cols[0].find(text=True)).replace(" ", "-")
            value = str(cols[1].find(text=True))
            # Strip spaces, ellipses and square brackets from the value.
            for unwanted in (" ", "...", "[", "]"):
                value = value.replace(unwanted, "")
            line = constant + "\t" + value
            print(line)
            f.write(line + "\n")
That is what print shows:
That is what I get on the output file
Thank you,
Salva
The file I was looking at was cached, so no changes were visible. Thanks for answering.
Running a program in cmd; the print function
# Fragment of the full script shown further down, which imports requests, csv
# and lxml.html and defines URL_LIST and text2search.
# NOTE(review): indentation was lost in this copy; nesting is inferred from
# statement order.
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
# Python 2 print statement: the leading '\r' moves the cursor back to the start
# of the line and the trailing comma suppresses the newline, so each status
# message overwrites the previous one.
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
# Each single-element unpacking below requires exactly one match; any other
# count raises ValueError.
# NOTE(review): '#class' is not XPath attribute syntax; these predicates were
# presumably '@class' before the text was copied. Confirm against the original.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
Which returns: Scraping URL 1 of 400
Over and over till count ends.
What i'm trying to learn today, is printing 2 outcomes on 2 separate lines, over and over till loop ends.
Example:
Scraping URL 1 of 400 Where bold character is only thing changing
Then if the scraper finds a result in the list;
Adding Result 1 to CSV Where bold character is only thing changing
So far i have tried a few print commands, but it either overwrites the entire sentence on the same line;
# Second attempt from the question: same loop as before with one extra status
# print after the CSV row is written.
# NOTE(review): indentation was lost in this copy; nesting is inferred from
# statement order.
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
# Leading '\r' plus trailing comma: the message is redrawn on the same line.
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
# NOTE(review): '#class' is not XPath attribute syntax; presumably '@class' in
# the original script. Confirm.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
# This message also starts with '\r' and ends with a comma, so it is drawn over
# the 'Scraping URL' text on the same line instead of on a line of its own.
print '\r' 'URL_FOUND' + str(index+1) + 'adding to CSV',
If I try to attach the two print statements to an else clause, only the first statement is printed and the second is ignored.
# Third attempt from the question: the result handling is moved under 'else:'.
# NOTE(review): indentation was lost in this copy; nesting is inferred from
# statement order.
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
# NOTE(review): no 'if' precedes this 'else', so it is presumably the else
# clause of the 'for' loop. A for/else body runs once, after the loop finishes
# without 'break', not once per URL. Confirm against the original indentation.
else:
if text2search in page.text:
tree = html.fromstring(page.content)
# NOTE(review): '#class' is not XPath attribute syntax; presumably '@class' in
# the original script. Confirm.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
# Adjacent string literals are concatenated, so this prints a newline followed
# by the literal word 'title', not the value of the variable 'title'.
print '\n' 'title'
Just wondering if anyone could point me in the right direction for printing two outcomes on 2 lines.
Full code below if required:
import requests
import csv
import datetime
import pandas as pd
import csv
from lxml import html
# Look up every property listed in the REA spreadsheet on realestate.com.au
# and write title / price / agent text to test1.csv for each page that
# contains the 'RECENTLY SOLD' marker.
import sys  # used for in-place progress output

# Raw string: the backslashes of the Windows path are taken literally.
# NOTE(review): newer pandas releases spell this keyword 'sheet_name'; confirm
# against the installed version.
df = pd.read_excel(r"C:\Python27\Projects\REA_SCRAPER\REA.xlsx", sheetname="REA")
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
# A property name becomes a URL slug: trimmed, lower-cased, spaces -> hyphens.
URL_LIST = [url_base + nd.strip().lower().replace(" ", "-") for nd in dnc_list]
text2search = '''RECENTLY SOLD'''


def _first_text(tree, xpath):
    """Return the text of the first node matching ``xpath``, or '' if none."""
    matches = tree.xpath(xpath)
    return matches[0].text_content() if matches else ''


# 'wb' is the mode the csv module expects on Python 2, which this script
# targets (see the C:\Python27 path above).
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    total = len(URL_LIST)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        # '\r' returns to the start of the line so the counter is redrawn in
        # place; flush so it shows immediately despite the missing newline.
        sys.stdout.write('\rScraping URL ' + str(index + 1) + ' of ' + str(total))
        sys.stdout.flush()
        if text2search not in page.text:
            continue
        tree = html.fromstring(page.content)
        # XPath attribute tests use '@'.  A missing node yields '' rather
        # than aborting the whole run with an unpacking error.
        title = _first_text(tree, '//title')
        price = _first_text(tree, '//div[@class="property-value__price"]')
        sold = _first_text(tree, '//p[@class="property-value__agent"]').strip()
        writer.writerow([title, price, sold])
I would have recommended curses, but you're on Windows and just writing what appears to be a small script; reason enough to not go down that rabbit hole.
The reason you are seeing your lines overwrite each other is because you are printing carriage returns \r, which moves the cursor to the start of the line. Any text written thereafter will overwrite previous printed text.
I found this with a quick Google, which may be of interest to you.
I'm new to Python and to coding in general, so I hope this question is not too basic.
I'm trying to make a CSV file from data scraped from the web.
AttributeError: 'Doctype' object has no attribute 'find_all'
But this error wont go away!
here's the whole code
import bs4 as bs
import urllib.request
# Question's script: downloads a MobyGames game page and tries to write the
# text of its scoreHi / scoreMed / scoreLow divs to scores1.csv.
# NOTE(review): indentation was lost in this copy; the extent of the loop body
# below is inferred from statement order.
# A browser-like User-Agent header is sent with the request.
req = urllib.request.Request('http://www.mobygames.com/game/tom-clancys-rainbow-six-siege',headers={'User-Agent': 'Mozilla/5.0'})
sauce = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(sauce,'lxml')
# Every <div> in the document.
scores = soup.find_all("div")
filename = "scores1.csv"
f = open(filename, "w")
headers = "Hi, Med, Low\n"
f.write(headers)
# NOTE(review): this iterates over 'soup' itself, not over the 'scores' list
# built above, and rebinds the name 'scores' to each top-level node of the
# document. The reported "'Doctype' object has no attribute 'find_all'" error
# presumably comes from the first find_all call below. Confirm.
for scores in soup:
scoreHi = scores.find_all("div", {"class":"scoreHi"})
# Indexing [0] assumes at least one match was found.
Hi = scoreHi[0].text
scoreMed = scores.find_all("div", {"class":"scoreMed"})
Med = scoreMed[0].text
scoreLow = scores.find_all("div", {"class":"scoreLow"})
Low = scoreLow[0].text
print ("Hi: " + Hi)
print ("Med: " + Med)
print ("Low: "+ Low)
# Commas inside the Med text are replaced with '|' before it is written.
f.write(Hi + "," + Med.replace(",","|") + "," + Low + "\n")
f.close()
You first assign to scores:
scores = soup.find_all("div")
which is fine, but you then should walk over those scores:
# Walk over the <div> elements collected in 'scores' and pull the three score
# values out of each one that actually contains them.
for score in scores:
    scoreHi = score.find_all("div", {"class":"scoreHi"})
    scoreMed = score.find_all("div", {"class":"scoreMed"})
    scoreLow = score.find_all("div", {"class":"scoreLow"})
    if not (scoreHi and scoreMed and scoreLow):
        # find_all returns an empty list when nothing matches, and indexing
        # [0] on it would raise IndexError.  A <div> without all three score
        # boxes is skipped.
        continue
    Hi = scoreHi[0].text
    Med = scoreMed[0].text
    Low = scoreLow[0].text
Trying to iterate over the Doc (i.e. soup) using:
for scores in soup:
makes no sense.