Selenium Python - Writing from a try-except Loop To CSV

I have written a script that loops through results and prints them, and I am trying to add saving to CSV, but I can't figure out how to do it.
I have code to save the output that works in my other scripts, but here it either writes only one line, or writes three lines (one per loop iteration).
How do I make it write all the results?
This is the code I am working with:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="/Users/**/Downloads/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('listing')

filename = "scrape1.1.csv"
f = open(filename, 'w')
headers = "Title, URL, Rating\n "
f.write("")

while True:
    try:
        meci = browser.find_elements_by_class_name('listing')
        for items in meci:
            title_cont = items.find_element_by_class_name('property_title')
            title = title_cont.text
            href = title_cont.get_attribute('href')
            rating = items.find_element_by_class_name('ui_bubble_rating')
            ratingbubble = rating.get_attribute('alt').replace(' of 5 bubbles', '')
            print(title)
            print(href)
            print(ratingbubble)
        time.sleep(3)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(3)
    except:
        break

f.write(title + "," + href + "," + ratingbubble + "\n")
f.close()
browser.quit()

Try this: move the f.write() call inside the for loop, so a row is written for every listing on every page, and write the headers once before the loop:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="C:/Py/pythonv4/gecko/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('listing')

filename = "scrape1.1.csv"
f = open(filename, 'w')
headers = "Title, URL, Rating\n"
f.write(headers)

while True:
    try:
        meci = browser.find_elements_by_class_name('listing')
        for items in meci:
            title_cont = items.find_element_by_class_name('property_title')
            title = title_cont.text
            href = title_cont.get_attribute('href')
            rating = items.find_element_by_class_name('ui_bubble_rating')
            ratingbubble = rating.get_attribute('alt').replace(' of 5 bubbles', '')
            print(title)
            print(href)
            print(ratingbubble)
            f.write(title + "," + href + "," + ratingbubble + "\n")
        time.sleep(5)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(1)
    except:
        break

f.close()
browser.quit()
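One caveat: if any title contains a comma, the plain string concatenation above shifts the columns. A minimal sketch of the same write using the standard-library csv module, which quotes such fields automatically (the example row data here is hypothetical):

import csv

# hypothetical example rows of (title, href, ratingbubble)
rows = [("The Ivy", "https://example.com/the-ivy", "4.5")]

with open("scrape1.1.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "URL", "Rating"])
    for title, href, ratingbubble in rows:
        writer.writerow([title, href, ratingbubble])  # fields containing commas get quoted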

Related

Python getting stuck in exception flow

Very new to Python and currently stumped. The first several rows work correctly, print, and write to the file just fine, but once it gets to row 11, it throws a "list index out of range" error, and does so for the remaining rows as well.
I can't figure out for the life of me how, once it fails at row 11, to start over on row 12 and run the try again (it should be successful on the remaining rows).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = "https://coinmarketcap.com/"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
rows = page_soup.find("tbody").findAll("tr")

filename = "coins.csv"
f = open(filename, "w")
headers = "Rank, Name, Price, Circulating Supply\n"
f.write(headers)

for row in rows:
    try:
        rank_number = row.findAll('td')[1].text
        coin_name = row.findAll('td')[2].text
        coin_price = row.findAll('td')[3].text
        supply = row.findAll('td')[8].text
        print(rank_number + "," + coin_name + "," + coin_price.replace(",", "") + "," + supply.replace(",", "") + "\n")
        f.write(rank_number + "," + coin_name + "," + coin_price.replace(",", "") + "," + supply.replace(",", "") + "\n")
    except Exception as e:
        print(e)
        continue

f.close()
Any help would be greatly appreciated!
Thanks!
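One hedged observation: the except/continue above already moves on to the next row, so the script isn't stuck; the same error repeating for every later row suggests those rows simply have fewer td cells than the first ten. A sketch of a per-row guard, assuming the same rows and f as above:

for row in rows:
    cells = row.findAll('td')
    if len(cells) < 9:  # hypothetical guard: this row lacks the expected columns
        print("skipping row with only", len(cells), "cells")
        continue
    rank_number = cells[1].text
    coin_name = cells[2].text
    coin_price = cells[3].text.replace(",", "")
    supply = cells[8].text.replace(",", "")
    f.write(rank_number + "," + coin_name + "," + coin_price + "," + supply + "\n")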

Try, except, and finally statements

So I am developing a Nitro Type bot, and I don't see a reason I should be getting an error, but around 20 races in it goes directly to sending me an email. Am I using the statements wrong? I am new to Python, so if it is a really simple, dumb mistake, please be nice, and if it is something I could easily find on the web, I'm really sorry.
try:
    time.sleep(4)
    driver.get('https://www.nitrotype.com/garage')
    driver.implicitly_wait(20)
    driver.find_element_by_css_selector('a.btn--light:nth-child(2)').click()
    time.sleep(5)
    driver.find_element_by_css_selector('button.btn--primary').click()
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector('.dash-copyContainer')
    time.sleep(4.25)
    html = driver.page_source.replace(' ', ' ')
    f = open("word.html", "w")
    f.write(html)
    f.close()
    with open("word.html", "r") as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        words = soup.find_all('span', class_='dash-letter')
        stuff = ""
        for span in words:
            if span.text.isascii():
                stuff += span.text
        with open("Sentence.txt", "w") as wf:
            wf.write(stuff)
            wf.close()
    e = open('Sentence.txt', 'r')
    s = e.read()
    Words = (s.split())
    Delay = ((len(s.split()) / WPM) * 60)
    int(Delay)
    Delay1 = Delay / len(s.split())
    for Word in Words:
        pyautogui.typewrite(Word + " ")
        time.sleep(Delay1)
    time.sleep(2)
    driver.get('https://www.nitrotype.com/garage')
except:
    time.sleep(4)
    driver.get('https://www.nitrotype.com/garage')
    driver.implicitly_wait(20)
    driver.find_element_by_css_selector('a.btn--light:nth-child(2)').click()
    time.sleep(5)
    driver.find_element_by_css_selector('button.btn--primary').click()
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector('.dash-copyContainer')
    time.sleep(4.25)
    html = driver.page_source.replace(' ', ' ')
    f = open("word.html", "w")
    f.write(html)
    f.close()
    with open("word.html", "r") as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        words = soup.find_all('span', class_='dash-letter')
        stuff = ""
        for span in words:
            if span.text.isascii():
                stuff += span.text
        with open("Sentence.txt", "w") as wf:
            wf.write(stuff)
            wf.close()
    e = open('Sentence.txt', 'r')
    s = e.read()
    Words = (s.split())
    Delay = ((len(s.split()) / WPM) * 60)
    int(Delay)
    Delay1 = Delay / len(s.split())
    for Word in Words:
        pyautogui.typewrite(Word + " ")
        time.sleep(Delay1)
    time.sleep(2)
    driver.get('https://www.nitrotype.com/garage')
finally:
    driver1 = webdriver.Chrome(executable_path='/Users/Braeden/Downloads/chromedriver.exe')
    driver1.get('https://accounts.google.com/ServiceLogin/signinchooser?service=mail&passive=true&rm=false&continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&ss=1&scc=1&ltmpl=default&ltmplcache=2&emr=1&osid=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin')
    time.sleep(2)
    driver1.find_element_by_css_selector('#identifierId').send_keys(EU)
    time.sleep(2)
    driver1.find_element_by_css_selector('.VfPpkd-vQzf8d').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('<div class="VfPpkd-RLmnJb"></div>').send_keys(EP)
    time.sleep(1)
    driver1.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > span:nth-child(2)').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > div:nth-child(1)').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('.T-I-KE').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('#\:c1').send_keys(TO)
    driver1.find_element_by_css_selector('#\:co').send_keys('Nitro type requires Captcha')
    driver1.find_element_by_css_selector('#\:b9').click()
    driver1.close()
    input('Did you complete the captcha:')
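Worth noting about the statements themselves: finally runs after every attempt, whether the try body succeeded or the except body ran, so the email automation executes on every pass, not only on failures. A minimal sketch of a flag-gated alternative, with run_race and send_captcha_email as hypothetical stand-ins for the Selenium logic above:

import time

def run_race():
    # hypothetical stand-in for the racing logic in the try block;
    # raise an exception here to simulate a captcha appearing
    pass

def send_captcha_email():
    # hypothetical stand-in for the Gmail automation in the finally block
    print("captcha alert sent")

for _ in range(3):
    needs_captcha = False
    try:
        run_race()
    except Exception as err:      # consider narrowing this to the actual exception type
        print("race failed:", err)
        needs_captcha = True
    finally:
        if needs_captcha:         # alert only when the race actually failed
            send_captcha_email()
            input('Did you complete the captcha: ')
    time.sleep(4)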

In Python, what should I add to fetch URLs from my text file or my XML file, which contains a list of URLs?

I have this code, which works fine with one link.
The code stores the values (availableOffers, otherpricess, currentprice, page_url) in the prices.csv file.
My problems are: First, I do not know what to write to fetch URLs from my text file or my XML file instead of the single URL in this code.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "XXXXXXXXX"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for otherpricess, I get this error:
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
Thanks
To fetch urls from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, let's say you have the following urls file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"

with open(out_filename, "w") as fw:
    fw.write(headers)
    with open("urls.txt", "r") as fr:
        for url in map(lambda x: x.strip(), fr.readlines()):  # the strip removes the trailing '\n'
            print(url)
            uClient = uReq(url)
            page_soup = soup(uClient.read(), "html.parser")
            # write the rest of the logic here
            # ...
            # write to the output file
            fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None, and only then extract the text. For example:
price_tag = page_soup.find("span", {"class": "price"})
otherpricess = price_tag.text.replace("$", "") if price_tag else ""
# in case there is no value, otherpricess will be an empty string, but you can change it to any other value
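If several fields can go missing (currentprice has the same failure mode), a small helper keeps the guard in one place. A minimal sketch; safe_text is a hypothetical name, and the find() calls are the ones from the question:

def safe_text(tag, default=""):
    # return the tag's text, or the default when find() returned None
    return tag.text if tag is not None else default

otherpricess = safe_text(page_soup.find("span", {"class": "price"})).replace("$", "")
currentprice = safe_text(page_soup.find("div", {"class": "is"})).strip().replace("$", "")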

Value of one variable is automatically assigned to second one

I am learning web scraping, but while formatting the scraped data I came across a problem: my two variables, first_line and second_line, both show the same value, and that value is second_line's.
Inside the else, when I print first_line I get the expected result, but outside the if/else first_line shows the value copied from second_line.
while current_page < 201:
    print(current_page)
    url = base_url + loc + "&start=" + str(current_page)
    yelp_r = requests.get(url)
    yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
    file_path = 'yelp-{loc}-2.txt'.format(loc=loc)
    with open(file_path, "a") as textfile:
        business = yelp_soup.findAll('div', {'class': 'biz-listing-large'})
        for biz in business:
            title = biz.findAll('a', {'class': 'biz-name'})[0].text
            print(title)
            second_line = ""
            first_line = ""
            try:
                address = biz.findAll('address')[0].contents
                for item in address:
                    if "br" in str(item):
                        second_line = second_line + item.getText()
                    else:
                        first_line = item.strip(" \n\t\r")
                        print(first_line)
                print(first_line)
                print(second_line)
            except:
                pass
            print('\n')
            try:
                phone = biz.findAll('span', {'class': 'biz-phone'})[0].text
            except:
                phone = None
            print(phone)
            page_line = "{title}\n{address_1}\n{address_2}\n{phone}".format(
                title=title,
                address_1=first_line,
                address_2=second_line,
                phone=phone
            )
            textfile.write(page_line)
    current_page += 10
If you call .get_text() on a node, it gives you the full text. You can then split on newline to get your first and second line:
first_line, second_line = biz.findAll('address')[0].get_text().split('\n')
But since you just print f'{first_line}\n{second_line}', why do you need them separate at all?
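One caveat on the unpacking above: if the address text does not contain exactly one newline, the two-variable assignment raises ValueError. A tolerant variant, assuming the same biz node:

parts = biz.findAll('address')[0].get_text().split('\n')
first_line = parts[0].strip()
second_line = parts[1].strip() if len(parts) > 1 else ""  # empty when there is no second line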

Scraping Python Script: Looping and updating output

So I have this Python script. Right now, I run the script and it gives me an output file in CSV.
What I want: when it finishes, restart it and check for changes to those output values (without refreshing the output file on restart and erasing all the previously collected data).
Also, it takes about 3 seconds per line of data to retrieve. Does anyone know how I can speed it up to handle large data sets?
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        #pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))

for i in inputs:
    extract(i[0])

ifile.close()
print("finished")
