I am learning web scraping, but while formatting the scraped data I ran into a problem: my two variables, first_line and second_line, both end up holding the same value, and that value is second_line's.
When I print first_line inside the else branch I get the expected result, but outside the if/else first_line shows the value copied from second_line.
while current_page < 201:
    print(current_page)
    url = base_url + loc + "&start=" + str(current_page)
    yelp_r = requests.get(url)
    yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
    file_path = 'yelp-{loc}-2.txt'.format(loc=loc)
    with open(file_path, "a") as textfile:
        business = yelp_soup.findAll('div',{'class':'biz-listing-large'})
        for biz in business:
            title = biz.findAll('a', {'class':'biz-name'})[0].text
            print(title)
            second_line = ""
            first_line = ""
            try:
                address = biz.findAll('address')[0].contents
                for item in address:
                    if "br" in str(item):
                        second_line = second_line + item.getText()
                    else:
                        first_line = item.strip(" \n\t\r")
                        print(first_line)
                print(first_line)
                print(second_line)
            except:
                pass
            print('\n')
            try:
                phone = biz.findAll('span',{'class':'biz-phone'})[0].text
            except:
                phone = None
            print(phone)
            page_line = "{title}\n{address_1}\n{address_2}\n{phone}".format(
                title=title,
                address_1=first_line,
                address_2=second_line,
                phone=phone
            )
            textfile.write(page_line)
    current_page += 10
If you call .get_text() on a node, it gives you the full text. You can then split on newline to get your first and second line:
first_line, second_line = biz.findAll('address')[0].get_text().split('\n')
But since you just print f'{first_line}\n{second_line}', why do you need them separate at all?
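If the markup contains stray whitespace around the <br>, the split can also produce empty pieces. A slightly more defensive version of the same idea, meant to slot into the existing for biz in business: loop (a sketch, not verified against Yelp's current markup):

address_tags = biz.findAll('address')
if address_tags:
    # get_text() flattens the <address> element; split on the newlines left in the source
    parts = [p.strip(" \n\t\r") for p in address_tags[0].get_text().split('\n')]
    parts = [p for p in parts if p]  # drop blank pieces from source whitespace
    first_line = parts[0] if parts else ""
    second_line = parts[1] if len(parts) > 1 else ""
else:
    first_line, second_line = "", ""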
I am running a Python scraper that scrapes quotes from a webpage and outputs the result into a CSV file.
I did not write it myself because I am a beginner, but while running the code to test it and reuse parts of it, I got this error. I know what the error means, but I am fairly clueless about how to fix it. I would like to push an update to the author's GitHub to help.
Traceback (most recent call last):
File "quotes.py", line 100, in <module>
get_authors()
File "quotes.py", line 58, in get_authors
quote_details = fetch_quote(url)
File "quotes.py", line 77, in fetch_quote
tempString += ("\"%s\","%next(q.find_class('b-qt')[0].iter('a')).text)
IndexError: list index out of range
The problem happens when it starts to fetch the quotes. Creating the list of authors and the list of URLs works without any issues. The IndexError is thrown right after the CSV file is created, so I assume the problem is with this part of the code:
tempString += ("\"%s\","%next(q.find_class('b-qt')[0].iter('a')).text)
Does this sound about right? I have absolutely no clue how to solve errors in Python beyond TypeErrors and the simpler IndexErrors. I would love to learn, but all my searching on Stack Overflow turned up people with the same issue around CSV files, and the answers were all very specific to their code.
#!/usr/bin/python
import requests
from lxml import html
import time
import string

def get_authors():
    baseUrl = 'http://www.brainyquote.com'
    urlString = 'http://www.brainyquote.com/authors/'
    authorsUrl = [urlString + x for x in list(string.lowercase[:26])]
    urlsList = [] # authors list page urls
    print ""
    print "Scanning Started for page links"
    print ""
    for url in authorsUrl:
        print "Scanning URL: %s"%url
        urlsList.append(url)
        urlsList.extend(pagination(url, False))
    authorsList = []
    print ""
    print "Scanning Started for Author Pages"
    print ""
    for url in urlsList:
        print "Scanning URL: %s"%url
        authorsList.extend(get_authors_links(url))
    # Write all authors links
    authorsFile = open("authors.txt","a+")
    for urls in authorsList:
        authorsFile.write(baseUrl + urls.encode('utf-8') + "\n")
    authorsFile.close()
    quoteLinks = []
    # Write all authors links
    print ""
    print "Scanning Started for Quote Page Links"
    print ""
    for url in authorsList:
        newUrl = (baseUrl + url)
        print "Scanning URL: %s"%newUrl
        quoteLinks.append(newUrl)
        arr = pagination(newUrl, True)
        quoteLinks.extend(arr)
    # Write all quotes link
    linksFile = open("quotes_links.txt","a+")
    for url in quoteLinks:
        linksFile.write(url.encode('utf-8') + "\n")
    linksFile.close()
    print ""
    print "Scanning Started for fetching quotes"
    print ""
    # Write all quotes
    quotesFile = open("quotes.csv","a+")
    for url in quoteLinks:
        quote_details = fetch_quote(url)
        quotesFile.write(quote_details.encode('utf-8') + "\n")
    print ""
    print "All Done \nThanks for using it...!!!"
    print ""

def get_authors_links(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    arr = tree.xpath('//table[@class="table table-hover table-bordered"]//td/a/@href')
    return arr

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    quotes = tree.find_class('bqQt')
    tempString = ""
    for q in quotes:
        tempString += ("\"%s\","%next(q.find_class('b-qt')[0].iter('a')).text)
        tempString += ("%s,"%next(q.find_class('bq-aut')[0].iter('a')).text)
        for element in q.find_class('oncl_k'):
            tempString += "%s "%element.text
        tempString += "\n"
    return tempString

def pagination(url, htmlPage): # .html or not - htmlPage True or False
    arr = []
    page = requests.get(url)
    tree = html.fromstring(page.text)
    end = tree.xpath('//div[@class="row paginationContainer"]//nav//ul/li[last()-1]/a/text()')
    if len(end):
        if(htmlPage):
            url = url.split('.html')[0]
            for count in range(2, int(end[0])+1):
                arr.append(url+"%s.html"%(count))
        else:
            for count in range(2, int(end[0])+1):
                arr.append(url+"%s"%(count))
    return arr

if __name__ == '__main__':
    get_authors()
Any ideas or pointers would be much appreciated. From what I know this should not be hard to fix, but as a beginner, changing even three lines in code longer than I am used to feels daunting.
All credit goes to the author; I hope I can push a fix with your help:
https://github.com/ravingupta/brainyquote/
This works (code converted to Python 3):
import requests
from lxml import html
import string

def get_authors():
    baseUrl = 'http://www.brainyquote.com'
    urlString = 'http://www.brainyquote.com/authors/'
    authorsUrl = [urlString + x for x in list(string.ascii_lowercase[:26])]
    urlsList = []  # authors list page urls
    print("")
    print("Scanning Started for page links")
    print("")
    for url in authorsUrl:
        print("Scanning URL: %s" % url)
        urlsList.append(url)
        urlsList.extend(pagination(url, False))
    authorsList = []
    print("")
    print("Scanning Started for Author Pages")
    print("")
    for url in urlsList:
        print("Scanning URL: %s" % url)
        authorsList.extend(get_authors_links(url))
    # Write all authors links
    authorsFile = open("authors.txt", "a+")
    for urls in authorsList:
        authorsFile.write(baseUrl + str(urls.encode('utf-8')) + "\n")
    authorsFile.close()
    quoteLinks = []
    # Write all authors links
    print("")
    print("Scanning Started for Quote Page Links")
    print("")
    for url in authorsList:
        newUrl = (baseUrl + url)
        print("Scanning URL: %s" % newUrl)
        quoteLinks.append(newUrl)
        arr = pagination(newUrl, True)
        quoteLinks.extend(arr)
    # Write all quotes link
    linksFile = open("quotes_links.txt", "a+")
    for url in quoteLinks:
        linksFile.write(str(url.encode('utf-8')) + "\n")
    linksFile.close()
    print("")
    print("Scanning Started for fetching quotes")
    print("")
    # Write all quotes
    quotesFile = open("quotes.csv", "a+")
    for url in quoteLinks:
        quote_details = fetch_quote(url)
        quotesFile.write(str(quote_details.encode('utf-8')) + "\n")
    print("")
    print("All Done \nThanks for using it...!!!")
    print("")

def get_authors_links(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    arr = tree.xpath('//table[@class="table table-hover table-bordered"]//td/a/@href')
    return arr

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    quotes = tree.find_class('bqQt')
    tempString = ""
    for q in quotes:
        tempString += ("\"%s\"," % next(q.find_class('b-qt')[0].iter('a')).text)
        tempString += ("%s," % next(q.find_class('bq-aut')[0].iter('a')).text)
        for element in q.find_class('oncl_k'):
            tempString += "%s " % element.text
        tempString += "\n"
    return tempString

def pagination(url, htmlPage):  # .html or not - htmlPage True or False
    arr = []
    page = requests.get(url)
    tree = html.fromstring(page.text)
    end = tree.xpath('//div[@class="row paginationContainer"]//nav//ul/li[last()-1]/a/text()')
    if len(end):
        if (htmlPage):
            url = url.split('.html')[0]
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s.html" % (count))
        else:
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s" % (count))
    return arr

if __name__ == '__main__':
    get_authors()
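The IndexError in the traceback comes from q.find_class('b-qt')[0] when a quote block does not contain the expected child element. If it still shows up on some pages, one option is to guard those lookups so an unexpected block is skipped instead of crashing. A hedged sketch of a defensive fetch_quote, keeping the original names and only adding the guards:

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    tempString = ""
    for q in tree.find_class('bqQt'):
        quote_nodes = q.find_class('b-qt')
        author_nodes = q.find_class('bq-aut')
        if not quote_nodes or not author_nodes:
            continue  # layout changed or block is not a real quote: skip it
        quote_link = next(quote_nodes[0].iter('a'), None)
        author_link = next(author_nodes[0].iter('a'), None)
        if quote_link is None or author_link is None:
            continue
        tempString += "\"%s\"," % quote_link.text
        tempString += "%s," % author_link.text
        for element in q.find_class('oncl_k'):
            tempString += "%s " % element.text
        tempString += "\n"
    return tempString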
Hello Community Members,
I am getting the error NameError: name 'f' is not defined. Please help; any sort of help is appreciated. I have been stuck on this for three days. The script extracts all the subcategory names of a Wikipedia category in Python 3.
I have tried both relative and absolute paths.
The code is as follows:
import httplib2
from bs4 import BeautifulSoup
import subprocess
import time, wget
import os, os.path

# declarations
catRoot = "http://en.wikipedia.org/wiki/Category:"
MAX_DEPTH = 100
done = []
ignore = []
path = 'trivial'

# Removes all newline characters and replaces with spaces
def removeNewLines(in_text):
    return in_text.replace('\n', ' ')

# Downloads a link into the destination
def download(link, dest):
    # print link
    if not os.path.exists(dest) or os.path.getsize(dest) == 0:
        subprocess.getoutput('wget "' + link + '" -O "' + dest + '"')
        print ("Downloading")

def ensureDir(f):
    if not os.path.exists(f):
        os.mkdir(f)

# Cleans a text by removing tags
def clean(in_text):
    s_list = list(in_text)
    i, j = 0, 0
    while i < len(s_list):
        # iterate until a left-angle bracket is found
        if s_list[i] == '<':
            if s_list[i+1] == 'b' and s_list[i+2] == 'r' and s_list[i+3] == '>':
                i = i + 1
                print ("hello")
                continue
            while s_list[i] != '>':
                # pop everything from the left-angle bracket until the right-angle bracket
                s_list.pop(i)
            # pops the right-angle bracket, too
            s_list.pop(i)
        elif s_list[i] == '\n':
            s_list.pop(i)
        else:
            i = i + 1
    # convert the list back into text
    join_char = ''
    return (join_char.join(s_list))  # .replace("<br>","\n")

def getBullets(content):
    mainSoup = BeautifulSoup(contents, "html.parser")

# Gets empty bullets
def getAllBullets(content):
    mainSoup = BeautifulSoup(str(content), "html.parser")
    subcategories = mainSoup.findAll('div', attrs={"class": "CategoryTreeItem"})
    empty = []
    full = []
    for x in subcategories:
        subSoup = BeautifulSoup(str(x))
        link = str(subSoup.findAll('a')[0])
        if (str(x)).count("CategoryTreeEmptyBullet") > 0:
            empty.append(clean(link).replace(" ", "_"))
        elif (str(x)).count("CategoryTreeBullet") > 0:
            full.append(clean(link).replace(" ", "_"))
    return ((empty, full))

def printTree(catName, count):
    catName = catName.replace("\\'", "'")
    if count == MAX_DEPTH: return
    download(catRoot + catName, path)
    filepath = "categories/Category:" + catName + ".html"
    print(filepath)
    content = open('filepath', 'w+')
    content.readlines()
    (emptyBullets, fullBullets) = getAllBullets(content)
    f.close()
    for x in emptyBullets:
        for i in range(count):
            print (" "),
        download(catRoot + x, "categories/Category:" + x + ".html")
        print (x)
    for x in fullBullets:
        for i in range(count):
            print (" "),
        print (x)
        if x in done:
            print ("Done... " + x)
            continue
        done.append(x)
        try: printTree(x, count + 1)
        except:
            print ("ERROR: " + x)

name = "Cricket"
printTree(name, 0)
The error encountered is the NameError: name 'f' is not defined mentioned above.
I think f.close() should be content.close().
It's common to use a context manager for such cases, though, like this:
with open(filepath, 'w+') as content:
    (emptyBullets, fullBullets) = getAllBullets(content)
Then Python will close the file for you, even in case of an exception.
(I also changed 'filepath' to filepath, which I assume is the intent here.)
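Applied to printTree, the relevant lines might look like the sketch below. Two assumptions worth flagging: 'r' is used instead of 'w+', since 'w+' truncates the file before you can read it, and the file is read into a string so that getAllBullets sees the HTML itself rather than a file object.

filepath = "categories/Category:" + catName + ".html"
print(filepath)
with open(filepath, 'r') as content_file:   # the filepath variable, not the string 'filepath'
    content = content_file.read()           # read the downloaded HTML once
(emptyBullets, fullBullets) = getAllBullets(content)
# no f.close()/content.close() needed: the with-block has already closed the file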
I have written a script that loops and prints the results. I am trying to add saving to CSV, but I can't figure out how to do it.
I have code that saves the file and works in my other scripts, but here it either writes only one line, or writes three lines (one for each loop).
How do I make it write all the results?
This is the code I am working with:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="/Users/**/Downloads/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('listing')

filename = "scrape1.1.csv"
f = open(filename, 'w')
headers = "Title, URL, Rating\n "
f.write("")

while True:
    try:
        meci = browser.find_elements_by_class_name('listing')
        for items in meci:
            title_cont = items.find_element_by_class_name('property_title')
            title = title_cont.text
            href = title_cont.get_attribute('href')
            rating = items.find_element_by_class_name('ui_bubble_rating')
            ratingbubble = rating.get_attribute('alt').replace(' of 5 bubbles', '')
            print(title)
            print(href)
            print(ratingbubble)
        time.sleep(3)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(3)
    except:
        break

f.write(title + "," + href + "," + ratingbubble + "\n")
f.close()
browser.quit()
Try this:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="C:/Py/pythonv4/gecko/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('listing')

filename = "scrape1.1.csv"
f = open(filename, 'w')
headers = "Title, URL, Rating\n "
f.write("")

while True:
    try:
        meci = browser.find_elements_by_class_name('listing')
        for items in meci:
            title_cont = items.find_element_by_class_name('property_title')
            title = title_cont.text
            href = title_cont.get_attribute('href')
            rating = items.find_element_by_class_name('ui_bubble_rating')
            ratingbubble = rating.get_attribute('alt').replace(' of 5 bubbles', '')
            print(title)
            print(href)
            print(ratingbubble)
            f.write(title + "," + href + "," + ratingbubble + "\n")
        time.sleep(5)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(1)
    except:
        break

f.close()
browser.quit()
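As a side note, not part of the question: restaurant names can themselves contain commas, which will shift the columns in a hand-built CSV line. The csv module handles the quoting for you; a small self-contained sketch of that write path, with a made-up example row:

import csv

# rows collected inside the scraping loop as (title, href, ratingbubble) tuples
rows = [("Some Restaurant, Soho", "https://example.com/r/123", "4.5")]  # made-up example

with open("scrape1.1.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Title", "URL", "Rating"])  # header row
    writer.writerows(rows)  # embedded commas are quoted automatically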
I am running a program in cmd; this is the print section of the code:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
Which returns: Scraping URL 1 of 400
Over and over until the count ends.
What I'm trying to learn today is how to print two outcomes on two separate lines, over and over until the loop ends.
Example:
Scraping URL 1 of 400 (where the bold character is the only thing changing)
Then, if the scraper finds a result in the list:
Adding Result 1 to CSV (where the bold character is the only thing changing)
So far I have tried a few print commands, but either it overwrites the entire sentence on the same line:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
            print '\r' 'URL_FOUND' + str(index+1) + 'adding to CSV',
Or, if I try to link the two print statements with an else clause, it only prints the first statement and the second is never acknowledged:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
    else:
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
            print '\n' 'title'
Just wondering if anyone could point me in the right direction for printing two outcomes on 2 lines.
Full code below if required:
import requests
import csv
import datetime
import pandas as pd
import csv
from lxml import html

df = pd.read_excel("C:\Python27\Projects\REA_SCRAPER\\REA.xlsx", sheetname="REA")
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
URL_LIST = []

for nd in dnc_list:
    nd = nd.strip()
    nd = nd.lower()
    nd = nd.replace(" ", "-")
    URL_LIST.append(url_base + nd)

text2search = '''RECENTLY SOLD'''

with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
I would have recommended curses, but you're on Windows and just writing what appears to be a small script; reason enough to not go down that rabbit hole.
The reason you are seeing your lines overwrite each other is because you are printing carriage returns \r, which moves the cursor to the start of the line. Any text written thereafter will overwrite previous printed text.
I found this with a quick Google, which may be of interest to you.
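For what it's worth, here is a minimal sketch of the pattern with placeholder data (URL_LIST and found are made up, not your scraper): keep updating the status line with \r, and finish it with a real newline only when there is a second message to show.

import sys

URL_LIST = ['url-%d' % i for i in range(1, 6)]  # placeholder URLs
found = {2, 4}                                  # pretend these indexes matched

for index, url in enumerate(URL_LIST):
    # \r moves the cursor to the start of the line, so this status message overwrites itself
    sys.stdout.write('\rScraping URL %d of %d' % (index + 1, len(URL_LIST)))
    sys.stdout.flush()
    if index in found:
        # end the status line with \n, then write the result on its own line
        sys.stdout.write('\nAdding Result %d to CSV\n' % (index + 1))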
So I have this Python script. Right now, I run it and it gives me an output file in CSV.
What I want: when it finishes, restart it and check those output values for changes (without refreshing the output file on restart and erasing all the previously collected data).
It also takes about 3 seconds per line of data to retrieve. Does anyone know how I can make it faster so it can handle large data sets?
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        # pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
# inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))

for i in inputs:
    extract(i[0])

ifile.close()
print("finished")
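No answer is attached above, but on the speed side, one common approach is to fetch the pages with a thread pool so the network waits overlap, and keep the CSV writing in the main thread. The sketch below is hedged: fetch_row is a placeholder standing in for the parsing done in extract(), and the worker count of 8 is an arbitrary assumption.

# Sketch: parallel fetching with a thread pool. multiprocessing.dummy exists on
# both Python 2 and 3.
from multiprocessing.dummy import Pool as ThreadPool
import csv, time

def fetch_row(url):
    # placeholder: run the same request/BeautifulSoup parsing as extract()
    # and return the finished CSV line (or None on failure)
    return url + ",price?,stock?," + time.ctime() + "\n"

urls = [row[0] for row in csv.reader(open('input.csv'))]

pool = ThreadPool(8)              # 8 concurrent workers; tune to the site's limits
rows = pool.map(fetch_row, urls)  # fetches run concurrently
pool.close()
pool.join()

with open('output.csv', 'a') as out:
    for row in rows:
        if row:
            out.write(row)        # single-threaded writes keep rows intact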