Python: print two outcomes on the same lines over and over

I'm running a program in cmd, and the print function looks like this:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
which prints Scraping URL 1 of 400 over and over until the count ends.
What I'm trying to learn today is printing two outcomes on two separate lines, over and over until the loop ends.
Example:
Scraping URL 1 of 400 (where the number is the only thing changing)
Then, if the scraper finds a result in the list:
Adding Result 1 to CSV (where the number is the only thing changing)
So far I have tried a few print commands, but either it overwrites the entire sentence on the same line:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
            print '\r' 'URL_FOUND' + str(index+1) + 'adding to CSV',
Or, if I try to tie the two print statements together with an else clause, it only prints the first statement and the second is never acknowledged:
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
    else:
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
            print '\n' 'title'
Just wondering if anyone could point me in the right direction for printing two outcomes on two lines.
Full code below if required:
import requests
import csv
import datetime
import pandas as pd
from lxml import html

df = pd.read_excel("C:\Python27\Projects\REA_SCRAPER\\REA.xlsx", sheetname="REA")
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
URL_LIST = []
for nd in dnc_list:
    nd = nd.strip()
    nd = nd.lower()
    nd = nd.replace(" ", "-")
    URL_LIST.append(url_base + nd)
text2search = '''RECENTLY SOLD'''
with open('test1.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (price,) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])

I would have recommended curses, but you're on Windows and just writing what appears to be a small script; reason enough not to go down that rabbit hole.
The reason your lines overwrite each other is that you are printing carriage returns (\r), which move the cursor to the start of the line. Any text written afterwards overwrites the previously printed text.
I found this with a quick Google, which may be of interest to you.
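For illustration, here is a minimal sketch of the two-line idea: write both status lines, then move the cursor back up with an ANSI escape code so the next pass overwrites them in place. This is an assumption about your console: classic Windows cmd only interprets ANSI codes from Windows 10 onward, so on older consoles you may need the colorama package.
import sys
import time

total = 5  # stand-in for len(URL_LIST)
for index in range(total):
    # line 1: progress counter; '\r' returns the cursor to column 0
    sys.stdout.write('\rScraping URL %d of %d\n' % (index + 1, total))
    # line 2: result counter, rewritten in place on every pass
    sys.stdout.write('\rAdding Result %d to CSV' % (index + 1))
    sys.stdout.write('\x1b[1A')  # ANSI escape: cursor up one line
    sys.stdout.flush()
    time.sleep(0.5)
sys.stdout.write('\n\n')  # step past both status lines when done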

Related

While using a web scraper, how can I make sure that once the 1st page has been scraped it will then scrape the 2nd page?

I am looking to output the body of each review from the site. I am getting the correct output for the first page, but if there are 4 pages of reviews I get the text from the first page 4 times. How can I make sure that the scraper moves to the next page each time?
import lxml.html as html
import math
import csv
import requests
import re
import time

# Trustpilot review page
basePage = 'http://www.trustpilot.com/review/'
reviewSite = 'www.boo-hoo.com'
reviewPage = basePage + reviewSite

# Data file to save to
datafile = 'datascrap.csv'

# Trustpilot default
resultsPerPage = 20

print('Scraper set for ' + reviewPage + ' - saving result to ' + datafile)

# Get page, skipping HTTPS as it gives certificate errors
page = requests.get(reviewPage, verify=False)
tree = html.fromstring(page.content)

# Total amount of ratings
ratingCount = tree.xpath('//h2[@class="header--inline"]')
ratingCount = ratingCount[0].text.replace(',', '')
ratingCount = ratingCount.replace(u'\xa0', u'')
ratingCount = ratingCount.replace(u'\n', u'')
ratingCount = ratingCount.replace(u'Average', u'')
ratingCount = ratingCount.replace(u' ', '')
ratingCount = ratingCount.replace(u'•', '')
ratingCount = ratingCount.replace(u'Great', '')
ratingCount = int(ratingCount)

# Amount of chunks to consider for displaying processing output
# For ex. 10 means output progress for every 10th of the data
tot_chunks = 20

# Throttling to avoid spamming page with requests
# With sleepTime seconds between every page request
throttle = True
sleepTime = 2

# Total pages to scrape
pages = math.ceil(ratingCount / resultsPerPage)
print('Found total of ' + str(pages) + ' pages to scrape')

with open(datafile, 'w', newline='', encoding='utf8') as csvfile:
    # Tab delimited to allow for special characters
    datawriter = csv.writer(csvfile, delimiter='\t')
    print('Processing..')
    for i in range(1, pages + 1):
        if (throttle): time.sleep(sleepTime)
        page = requests.get(reviewPage + '?page=' + str(i))
        tree = html.fromstring(page.content)
        # The item below scrapes a review body.
        bodies = tree.xpath('//p[@class="review-content__text"]')
        for idx, e in enumerate(bodies):
            # Progress counting, outputs for every processed chunk
            reviewNumber = idx + 20 * (i - 1) + 1
            chunk = int(ratingCount / tot_chunks)
            if reviewNumber % chunk == 0:
                print('Processed ' + str(reviewNumber) + '/' + str(ratingCount) + ' ratings')
            # Body of comment
            body = e.text_content().strip()
            datawriter.writerow([body])
print('Processed ' + str(ratingCount) + '/' + str(ratingCount) + ' ratings.. Finished!')
If, for example, the site has 80 reviews, I get the first 20 four times. But when I tried printing the page number on each pass, it shows it is going to 1, 2, 3, etc.
reviewSite was incorrect. Change it from reviewSite = 'www.boo-hoo.com' to reviewSite = 'boo-hoo.com'.
If you go to page 2 in the browser, you'll see it as:
https://www.trustpilot.com/review/boo-hoo.com?page=2
but you're concatenating www.boo-hoo.com, so it's incorrectly trying to go to:
https://www.trustpilot.com/review/www.boo-hoo.com?page=2
which then defaults to the first page.
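A minimal sketch of the corrected setup, reusing the variables from the question:
# corrected: no leading "www." so the pagination URLs resolve properly
basePage = 'http://www.trustpilot.com/review/'
reviewSite = 'boo-hoo.com'
reviewPage = basePage + reviewSite
# page 2 is now http://www.trustpilot.com/review/boo-hoo.com?page=2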

Python is doubling (" ") to ("" "") - how can I fix this?

This is an example of the code that I'm using, without the website added.
from bs4 import BeautifulSoup
import requests
import csv
import random as rd

source = requests.get('http://example.com').text
file = open('C:/xampp/htdocs/new-site/text.php', 'w')
soup = BeautifulSoup(source, 'lxml')
header = soup.find('p', class_='StoryHead').text
Program_Title = 'Program Title'
CSS = '<link rel="stylesheet" href="./css.css">'
start = CSS + '<div id="yo">' + '<div id="first">' + '<h1>' + '\n' + '\n' + \
    Program_Title + '\n' + header + '</h1>' + '</div id="first">'
csv_writer = csv.writer(file)
csv_writer.writerow( start )
Here are some of the results from the code; as you can see, it doubles (" ") into ("" ""):
"<link rel=""stylesheet"" href=""./css.css""><div id=""yo""><div id=""first"">
<h1>
This looks to be coming from the csv writer escaping the double quotes; the default quotechar is (").
try:
csv_writer = csv.writer(file, delimiter=',', quotechar='\\')
EDIT: Updated parameters to csv.writer
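For reference, the doubling itself is standard CSV quoting (QUOTE_MINIMAL, the default): a field containing the quotechar gets wrapped in quotes and the embedded quotes are doubled. A minimal demonstration:
import csv
import io

buf = io.StringIO()
csv.writer(buf).writerow(['he said "hi"'])
print(buf.getvalue())  # prints: "he said ""hi""" (quotes doubled per the CSV format)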
Change from writing through the csv module to plain file writing. Change this:
file = open('C:/xampp/htdocs/new-site/text.php', 'w')
#content here
csv_writer = csv.writer(file)
csv_writer.writerow( start )
To this:
file = open('C:/xampp/htdocs/new-site/text.php', 'w')
#content here
file.write(start)

Value of one variable is automatically assigned to the second one

I am learning web scraping, but while formatting the scraped data I came across a problem: my two variables, first_line and second_line, are both showing the same value, and that value is second_line's.
Inside the else, when I tried printing first_line I got the expected result, but outside the if/else, first_line shows the value copied from second_line.
while current_page < 201:
    print(current_page)
    url = base_url + loc + "&start=" + str(current_page)
    yelp_r = requests.get(url)
    yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
    file_path = 'yelp-{loc}-2.txt'.format(loc=loc)
    with open(file_path, "a") as textfile:
        business = yelp_soup.findAll('div', {'class': 'biz-listing-large'})
        for biz in business:
            title = biz.findAll('a', {'class': 'biz-name'})[0].text
            print(title)
            second_line = ""
            first_line = ""
            try:
                address = biz.findAll('address')[0].contents
                for item in address:
                    if "br" in str(item):
                        second_line = second_line + item.getText()
                    else:
                        first_line = item.strip(" \n\t\r")
                        print(first_line)
                print(first_line)
                print(second_line)
            except:
                pass
            print('\n')
            try:
                phone = biz.findAll('span', {'class': 'biz-phone'})[0].text
            except:
                phone = None
            print(phone)
            page_line = "{title}\n{address_1}\n{address_2}\n{phone}".format(
                title=title,
                address_1=first_line,
                address_2=second_line,
                phone=phone
            )
            textfile.write(page_line)
    current_page += 10
If you call .get_text() on the node, it gives you the full text. You can then split on the newline to get your first and second lines:
first_line, second_line = biz.findAll('address')[0].get_text().split('\n')
But since you just print f'{first_line}\n{second_line}', why do you need them separate at all?
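If the address can come back with extra or missing line breaks, a slightly more defensive variant of the same idea (a sketch, assuming the same biz node as above) avoids unpacking errors:
# split the address text, drop empty segments, then take up to two lines
parts = [p.strip() for p in biz.findAll('address')[0].get_text().split('\n')]
parts = [p for p in parts if p]
first_line = parts[0] if parts else ''
second_line = parts[1] if len(parts) > 1 else ''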

Python - scraping a paginated site and writing the results to a file

I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of pages of news and record the article titles and their links. I have managed to get that done for the first page; the problem is getting the content of the subsequent pages. By searching on Stack Overflow, I think I managed to find a solution that makes the script access more than one URL, but it seems to be overwriting the content extracted from each page it accesses, so I always end up with the same number of recorded articles in the file. Something that might help: I know that the URLs follow this model: "/ultimas/?page=1", "/ultimas/?page=2", etc., and the site appears to use AJAX to request new articles.
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="
for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))
    letters = soup.find_all("div", class_="titulo-noticia")
    letters[0]
    lobbying = {}
    for element in letters:
        lobbying[element.a.get_text()] = {}
    letters[0].a["href"]
    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]
    for item in lobbying.keys():
        print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

import os, csv
os.chdir("...")
with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

import json
with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)
print "Fim"
Any help on how I might go about adding the content of each page to the final file would be very appreciated. Thank you!
How about this one, if it serves the same purpose:
import csv, requests
from lxml import html

base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"

outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption", "Link"])

for url in [program_url.format(page) for page in range(1, 4)]:
    response = requests.get(url)
    tree = html.fromstring(response.text)
    for title in tree.xpath("//div[@class='noticia']"):
        caption = title.xpath('.//span[@class="field-content"]/a/text()')[0]
        policy = title.xpath('.//span[@class="field-content"]/a/@href')[0]
        writer.writerow([caption, base_url + policy])
It looks like the code in your for loop (for page in range(1, 4):) isn't being executed as you expect because your file isn't correctly indented. If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="
for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))
    letters = soup.find_all("div", class_="titulo-noticia")
    lobbying = {}
    for element in letters:
        lobbying[element.a.get_text()] = {}
    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]
    for item in lobbying.keys():
        print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"
    #os.chdir("...")
    with open("lobbying.csv", "w") as toWrite:
        writer = csv.writer(toWrite, delimiter=",")
        writer.writerow(["name", "link",])
        for a in lobbying.keys():
            writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])
    with open("lobbying.json", "w") as writeJSON:
        json.dump(lobbying, writeJSON)
print "Fim"

Scraping Python Script: Looping and updating output

So I have this Python script. Right now, I run the script and it gives me an output file in CSV.
What I want: when the script finishes, it should restart and check for changes to those output values (without refreshing the output file on restart and erasing all the previously collected data).
It also takes about 3 seconds per line of data to retrieve. Does anyone know how I can speed it up to handle large data sets?
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        #pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
for i in inputs:
    extract(i[0])
ifile.close()
print("finished")
