This is an example of the code that I'm using, without the website added.
# Fetch a page, pull the story headline out of it, and write an HTML fragment
# containing that headline to a PHP file.
from bs4 import BeautifulSoup
import requests
import csv
import random as rd
# Download the page and keep the response body as text.
source = requests.get('http://example.com').text
# NOTE(review): this handle is never closed in the visible code.
file = open('C:/xampp/htdocs/new-site/text.php', 'w')
soup = BeautifulSoup(source, 'lxml')
# Text of the first <p class="StoryHead">; find() returns None when there is
# no such element, in which case .text raises AttributeError.
header = soup.find('p', class_='StoryHead').text
Program_Title = 'Program Title'
CSS = '<link rel="stylesheet" href="./css.css">'
# Assemble the HTML fragment as one string.
start = CSS + '<div id="yo">' + '<div id="first">' + '<h1>' + '\n' + '\n' + \
Program_Title + '\n' + header + '</h1>' + '</div id="first">'
# NOTE(review): writerow() expects a sequence of fields, but `start` is a
# single str, and CSV quoting doubles embedded '"' characters; a plain
# file.write(start) looks like what is wanted here - confirm.
csv_writer = csv.writer(file)
csv_writer.writerow( start )
Here are some of the results from the code; as you can see, the double quotes (") come out doubled:
"<link rel=""stylesheet"" href=""./css.css""><div id=""yo""><div id=""first"">
<h1>
This looks like it comes from the csv writer escaping the double quotes.
The default quotechar is (").
Try:
# Use a backslash as the quote character instead of the default '"', so that
# double quotes inside a field are no longer doubled on output.
# NOTE(review): fields that contain the delimiter will then be wrapped in
# backslashes instead - confirm that is acceptable.
csv_writer = csv.writer(file, delimiter=',', quotechar='\\')
EDIT: Updated parameters to csv.writer
Replace the csv writer with a plain file write. Change this:
# Before: the assembled string is passed through csv.writer, which applies
# CSV quoting to it.
file = open('C:/xampp/htdocs/new-site/text.php', 'w')
#content here
csv_writer = csv.writer(file)
csv_writer.writerow( start )
To this:
# After: write the assembled HTML string straight to the file.  No CSV quoting
# is applied, so embedded double quotes are written unchanged.  The
# with-statement closes (and therefore flushes) the file even if building
# `start` raises.
with open('C:/xampp/htdocs/new-site/text.php', 'w') as file:
    #content here
    file.write(start)
Related
I have this code, which works fine with one link.
The code stores the values (availableOffers, otherpricess, currentprice, page_url) in the prices.csv file.
My problems are: First: I do not know what to write to fetch URLs from my text file or my XML file instead of the single URL in this code.
# Scrape offer/price values from a single product page and save them as one
# row of prices.csv.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "XXXXXXXXX"
# Download and parse the page, then release the connection.
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# Each find() returns None when the element is absent, so the chained
# ["value"] / .text access raises on pages that lack it.
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
# Write the header line followed by a single data row.
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for otherpricess, I get this error:
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
Thanks.
To fetch URLs from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, let's say you have the following URLs file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"

# Scrape every URL listed in urls.txt (one per line) and write one row per URL.
# uReq and soup are the urlopen / BeautifulSoup aliases imported by the
# single-URL script.
with open(out_filename, "w") as fw, open("urls.txt", "r") as fr:
    fw.write(headers)
    # Iterating the file object reads it line by line.
    for line in fr:
        url = line.strip()  # the strip removes the trailing '\n'
        if not url:
            continue  # ignore blank lines in urls.txt
        print(url)

        uClient = uReq(url)
        try:
            page_soup = soup(uClient.read(), "html.parser")
        finally:
            # Release the connection even when reading or parsing fails.
            uClient.close()

        # find() returns None for a missing element; fall back to "" so one
        # incomplete page does not abort the whole run.
        offers_tag = page_soup.find("input", {"id": "availableOffers"})
        price_tag = page_soup.find("span", {"class": "price"})
        current_tag = page_soup.find("div", {"class": "is"})
        availableOffers = offers_tag["value"] if offers_tag is not None else ""
        otherpricess = price_tag.text.replace("$", "") if price_tag is not None else ""
        currentprice = (
            current_tag.text.strip().replace("$", "") if current_tag is not None else ""
        )

        # Write the row for this URL (the loop variable, not the old
        # single-URL `page_url`).
        fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None and if so, extract the text. For example:
# Look the element up once; find() returns None when the page has no such tag.
price_tag = page_soup.find("span", {"class": "price"})
# In case there is no value, otherpricess will be an empty string, but you can
# change it to any other value.
otherpricess = price_tag.text.replace("$", "") if price_tag is not None else ""
I have a script which reads an HTML file and extracts the relevant lines from it, but I have a problem when printing the filename. The file names are source1.html, source2.html and source3.html. Instead it prints source2.html, source3.html and source4.html.
# For each existing sourceN.html (N = 1, 2, ...), collect the .jpg base names
# found inside its <div class="cplayer"> and write them to OUTPUT.csv.
from bs4 import BeautifulSoup
import re
import os.path
n = 1
filename = "source"+str(n)+".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
# n is advanced before the current file has been reported ...
n = n+1
strjpgs = "Extracted Layers: \n \n"
# NOTE(review): this handle is never closed in the visible code.
file = open(filename, "r")
# ... so from here on `filename` names the NEXT file, not the one just opened.
filename = "source"+str(n)+".html"
soup = BeautifulSoup (file, "html.parser")
thedata = soup.find("div", class_="cplayer")
strdata = str(thedata)
# Capture the run of non-'/' characters between a '/' and '.jpg'.
DoRegEx = re.compile('/([^/]+)\.jpg')
jpgs = DoRegEx.findall(strdata)
strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
savefile.write(filename + '\n')
savefile.write(strjpgs)
print(filename)
print(strjpgs)
savefile.close()
print "done"
You define n as 1, then inside your while loop you immediately increment it to 2. By the time you get to print(filename), n is 2 and filename has been changed to "source2.html". Move the print or move the increment.
You just need to move your print statement to the beginning of your loop and your increment to the end, ready for the next iteration:
# The pattern never changes, so compile it once instead of on every pass.
# It captures the run of non-'/' characters between a '/' and '.jpg'.
DoRegEx = re.compile(r'/([^/]+)\.jpg')
while os.path.isfile(filename):
    # Report the file that is about to be processed.
    print(filename)
    strjpgs = "Extracted Layers: \n \n"
    # The with-block closes each HTML file as soon as it has been parsed.
    with open(filename, "r") as file:
        soup = BeautifulSoup(file, "html.parser")
    thedata = soup.find("div", class_="cplayer")
    strdata = str(thedata)
    jpgs = DoRegEx.findall(strdata)
    strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
    savefile.write(filename + '\n')
    savefile.write(strjpgs)
    print(strjpgs)
    # Advance to the next file only after everything for the current one has
    # been written.  Rebuilding `filename` earlier labels this file's data
    # with the wrong name and makes the loop test re-check a stale value.
    n = n+1
    filename = "source"+str(n)+".html"
You made a logical mistake: you increment the variable n, and rebuild filename from it, before you have written and printed the name of the file you are currently processing. The simplest solution is to update n and filename only at the end of each iteration. The next mistake is that you never close the HTML files; use with open(filename, 'r') as file: instead, which closes the file automatically when the block ends and is more Pythonic.
from bs4 import BeautifulSoup
import re
import os.path

# Compiled once: captures the run of non-'/' characters between a '/' and
# '.jpg'.
DoRegEx = re.compile(r'/([^/]+)\.jpg')

n = 1
filename = "source"+str(n)+".html"
# Both the output file and each HTML file are managed by with-blocks, so they
# are closed even if parsing raises.
with open('OUTPUT.csv', 'w') as savefile:
    # Keep going for as long as the next numbered source file exists, instead
    # of stopping at a hard-coded "source3.html".
    while os.path.isfile(filename):
        with open(filename, "r") as file:
            soup = BeautifulSoup(file, "html.parser")
        thedata = soup.find("div", class_="cplayer")
        jpgs = DoRegEx.findall(str(thedata))
        strjpgs = "Extracted Layers: \n \n" + "\n".join(jpgs) + "\n \n"
        savefile.write(filename + '\n')
        savefile.write(strjpgs)
        print(filename)
        print(strjpgs)
        # Move on to the next file only after the current one has been
        # written and reported under its own name.
        n += 1
        filename = "source"+str(n)+".html"
print ("done")
Running a program in cmd; the print function
# Visit every URL and, for pages that contain the search text, write the
# title, price and agent text as one CSV row.  (Python 2 syntax.)
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
# '\r' returns the cursor to the start of the line and the trailing comma
# suppresses the newline, so each progress message overwrites the previous one.
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
# Each one-element tuple unpacking raises ValueError unless the XPath matches
# exactly one node.
# NOTE(review): '[#class=...]' is not valid XPath; the attribute test was
# presumably '[@class=...]' - confirm against the original script.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
Which returns: Scraping URL 1 of 400
Over and over till count ends.
What I'm trying to learn today is how to print 2 outcomes on 2 separate lines, over and over until the loop ends.
Example:
Scraping URL 1 of 400 Where bold character is only thing changing
Then if the scraper finds a result in the list;
Adding Result 1 to CSV Where bold character is only thing changing
So far i have tried a few print commands, but it either overwrites the entire sentence on the same line;
# Attempt 1: print a second status message after a matching row is written.
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
# Progress message; '\r' plus the trailing comma keeps it on one line.
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
# NOTE(review): '[#class=...]' is not valid XPath; presumably '[@class=...]'
# was intended - confirm.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
# This message also starts with '\r', so it is drawn over the 'Scraping URL'
# text on the same line rather than on a line of its own.
print '\r' 'URL_FOUND' + str(index+1) + 'adding to CSV',
If I try to attach the two print statements to an else clause, it only prints the first statement and the second is not acknowledged.
# Attempt 2: put the result handling under an `else`.
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
# NOTE(review): the indentation was lost, so it is unclear which statement
# this `else` belonged to; as written it follows a print statement - confirm.
else:
if text2search in page.text:
tree = html.fromstring(page.content)
# NOTE(review): '[#class=...]' is not valid XPath; presumably '[@class=...]'
# was intended - confirm.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
# Prints a newline followed by the literal word 'title' (a string literal),
# not the value of the `title` variable.
print '\n' 'title'
Just wondering if anyone could point me in the right direction for printing two outcomes on 2 lines.
Full code below if required:
# Build property URLs from a spreadsheet column, then scrape each page and
# record the ones containing the search text in test1.csv.  (Python 2 syntax.)
import requests
import csv
import datetime
import pandas as pd
import csv
from lxml import html
# Read the "REA" sheet of the workbook.
# NOTE(review): non-raw Windows path; it works only because '\P' and '\R' are
# not escape sequences - a raw string would be safer.
df = pd.read_excel("C:\Python27\Projects\REA_SCRAPER\\REA.xlsx", sheetname="REA")
# Take the 'Property' column as a plain list.
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
URL_LIST = []
# Normalise each entry (trim, lower-case, spaces -> hyphens) and append it to
# the base URL.
for nd in dnc_list:
nd = nd.strip()
nd = nd.lower()
nd = nd.replace(" ", "-")
URL_LIST.append(url_base + nd)
# Only pages whose text contains this marker are written to the CSV.
text2search = '''RECENTLY SOLD'''
with open('test1.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
# '\r' returns the cursor to the start of the line and the trailing comma
# suppresses the newline, so the progress text is redrawn in place.
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
# Each one-element tuple unpacking raises ValueError unless the XPath matches
# exactly one node.
# NOTE(review): '[#class=...]' is not valid XPath; presumably '[@class=...]'
# was intended - confirm.
(title,) = (x.text_content() for x in tree.xpath('//title'))
(price,) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
I would have recommended curses, but you're on Windows and just writing what appears to be a small script; reason enough to not go down that rabbit hole.
The reason you are seeing your lines overwrite each other is because you are printing carriage returns \r, which moves the cursor to the start of the line. Any text written thereafter will overwrite previous printed text.
I found this with a quick Google, which may be of interest to you.
I am trying to extract a review from one page in Zomato using requests and Beautiful Soup 4 in Python. I want to store the link of the requested page and the extracted review in one CSV file.
My problem is that the review I extracted does not store into one cell but instead it splits into multiple cells. How do I store my extracted review into one cell?
Here is my code:
# Download one Zomato review page and save its URL and review text to out.csv.
import time
from bs4 import BeautifulSoup
import requests
URL = "https://www.zomato.com/review/eQEygl"
time.sleep(2)
reviewPage = requests.get(URL, headers = {'user-agent': 'my-app/0.0.1'})
reviewSoup = BeautifulSoup(reviewPage.content,"html.parser")
# The element holding the review text (None if the page has no such <div>).
reviewText = reviewSoup.find("div",{"class":"rev-text"})
# Re-parse that element's markup so that only its strings are iterated below.
textSoup = BeautifulSoup(str(reviewText), "html.parser")
# reviewElem[0] is the link; reviewElem[1] accumulates the review text.
reviewElem = [URL, ""]
for string in textSoup.stripped_strings:
reviewElem[1] += string
# NOTE(review): this variable is named `csv`, which would hide the csv module
# if it were imported.
csv = open("out.csv", "w", encoding="utf-8")
csv.write("Link, Review\n")
# The row is joined with a bare ',' and the review is not quoted, so any comma
# inside the review text starts a new column.
row = reviewElem[0] + "," + reviewElem[1] + "\n"
csv.write(row)
csv.close()
Output
Expected Output
I think the problem is the commas embedded in the reviewElem[1] string, because they are the default delimiter in most CSV software. The following avoids the problem by wrapping the contents of the string in " characters to indicate it's all one cell:
import time
from bs4 import BeautifulSoup
import requests

URL = "https://www.zomato.com/review/eQEygl"

time.sleep(2)
reviewPage = requests.get(URL, headers={'user-agent': 'my-app/0.0.1'})
reviewSoup = BeautifulSoup(reviewPage.content, "html.parser")
reviewText = reviewSoup.find("div", {"class": "rev-text"})
textSoup = BeautifulSoup(str(reviewText), "html.parser")

# reviewElem[0] is the link; reviewElem[1] is the review text, built by joining
# all of its whitespace-stripped fragments in one pass.
reviewElem = [URL, "".join(textSoup.stripped_strings)]

# Wrapping the review in '"' keeps embedded commas inside one cell.  A '"'
# inside the review would otherwise end the quoted cell early, so each one is
# doubled, which is how CSV represents a literal quote in a quoted field.
quoted_review = '"{}"'.format(reviewElem[1].replace('"', '""'))
row = reviewElem[0] + "," + quoted_review + "\n"

# The with-block closes the file even if a write fails.
with open("out.csv", "w", encoding="utf-8") as csv_file:
    csv_file.write("Link, Review\n")
    csv_file.write(row)
There is no need to manually construct a CSV string. When you do it manually, if there are column delimiters (, by default) inside the column values, they are interpreted as delimiters and not literal strings leading to a column value being scattered around multiple columns.
Use the csv module and the .writerow() method:
import csv
# ...
# newline="" is what the csv module expects for files it writes to; without it
# extra blank lines can appear between rows on Windows.
with open("out.csv", "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Link", "Review"])
    # The writer quotes the review automatically when it contains commas,
    # quotes or line breaks, so it always lands in a single cell.
    writer.writerow(reviewElem)
I need some help to save the output from a basic web scraper to a CSV file.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Fetch the page; the with-block closes the connection once it is parsed.
with urlopen("some_url") as html_:
    bsObj_ = BeautifulSoup(html_, "html.parser")

# Collect the matching <div> elements.  The lookup must go through `bsObj_`,
# the soup created above; `bsObj_2` was never defined.
nameList_ = bsObj_.findAll("div", {"class":"row proyecto_name_venta"})
for name in nameList_:
    print(name.get_text())
Specifically, I want to save the name.get_text() result in a CSV file.
If the elements in nameList_ are rows with the columns delimited by ',' try this:
import csv
# newline='' lets the csv module manage line endings itself, which avoids
# blank lines between rows on Windows.
with open('out.csv', 'w', newline='') as outf:
    writer = csv.writer(outf)
    # One CSV row per element: its text split into columns at each ','.
    writer.writerows(name.get_text().split(',') for name in nameList_)
If each name.get_text() is just a string and you want to write a single-column CSV, you might try this:
import csv
# newline='' lets the csv module manage line endings itself, which avoids
# blank lines between rows on Windows.
with open('out.csv', 'w', newline='') as outf:
    writer = csv.writer(outf)
    # Wrap each text in a one-element list so it is written as a single
    # column instead of being split into characters.
    writer.writerows([name.get_text()] for name in nameList_)
This is a pretty comprehensive example of what you asked for . . . .
from urllib.request import urlopen

# Download the historical price table for each ticker and merge them into one
# CSV file with the ticker symbol prepended as a "Company" column.
listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]
urls = [
    'http://real-chart.finance.yahoo.com/table.csv?s=' + company
    + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv'
    for company in listOfStocks
]

header_written = False
# newline='' writes the downloaded rows with the line endings they arrived
# with; the with-block closes the file even if a download fails.
with open('C:/Users/rshuell001/Historical_Prices.csv', 'w', newline='') as Output_File:
    for company, url in zip(listOfStocks, urls):
        # Fetch each table exactly once and keep the line endings on the rows.
        with urlopen(url) as response:
            rows = response.read().decode('utf-8').splitlines(True)
        if not rows:
            continue  # nothing came back for this ticker

        # The first non-empty download supplies the shared header line.
        if not header_written:
            Output_File.write("Company," + rows[0])
            header_written = True

        # Stream the data rows straight to the file instead of accumulating
        # one ever-growing string.
        Output_File.writelines(company + ',' + row for row in rows[1:])