I'm working on a web parser for a webpage containing mathematical constants. I need to replace some characters to get them into a specific format, but I don't know why: if I print the result it looks fine, yet when I open the output file the formatting done by replace() doesn't seem to have taken effect.
This is the code:
#!/usr/bin/env python3
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "http://www.ebyte.it/library/educards/constants/ConstantsOfPhysicsAndMath.html"
soup = BeautifulSoup(urlopen(url).read(), "html5lib")

f = open("ebyteParse-output.txt", "w")

table = soup.find("table", attrs={"class": "grid9"})
rows = table.findAll("tr")
for tr in rows:
    # If its a category of constants we write that as a comment
    if tr.has_attr("bgcolor"):
        f.write("\n\n# " + tr.find(text=True) + "\n")
        continue
    cols = tr.findAll("td")
    if (len(cols) >= 2):
        if (cols[0]["class"][0] == "box" or cols[0]["class"][0] == "boxi" and cols[1]["class"][0] == "boxa"):
            constant = str(cols[0].find(text=True)).replace(" ", "-")
            value = str(cols[1].find(text=True))
            value = value.replace(" ", "").replace("...", "").replace("[", "").replace("]", "")
            print(constant + "\t" + value)
            f.write(constant + "\t" + value)
            f.write("\n")

f.close()
This is what print shows:
This is what I get in the output file:
Thank you,
Salva
The file I was looking at was cached, so no changes were visible. Thanks for answering.
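In case someone else hits the same symptom (print() shows the right format but the file on disk looks stale), it also helps to let a with-block close, and therefore flush, the handle before opening the file elsewhere. A minimal sketch, separate from the caching resolution above:

with open("ebyteParse-output.txt", "w") as f:
    # all writes happen inside the block
    f.write("some-constant\t3.14159\n")
# the handle is closed here, so everything is flushed to disk
# before the file is opened in an editor or another program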
I would like the script to scrape all items from each page and append them to a CSV file, but there are 2 problems:
1) When I run the script it only goes to a single page (the last page = 64). It doesn't crawl from page 1 to 64.
2) When the script writes data to the CSV file it doesn't append new lines; it rewrites the whole CSV file.
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')

with open('result.csv', 'w') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num
    driver.get(url)

buyers = driver.find_elements_by_xpath("//div[@class='product-title']")
prices = driver.find_elements_by_xpath("//span[@class='new-price-text']")
discount = driver.find_elements_by_xpath("//div[@class='discount']")
oldprice = driver.find_elements_by_xpath("//span[@class='old-price-text']")

num_page_items = len(buyers)
with open('result.csv', 'a') as f:
    for c in range(num_page_items):
        f.write(buyers[c].text + ' , ' + prices[c].text + ' , ' + discount[c].text + ' , ' + oldprice[c].text + '\n')

driver.close()
The main issue you had is an indentation problem: the scraping code ran only once, on the last page the driver had visited.
Another issue I saw is that you were collecting all the titles together, all the old prices together, and so on.
For this reason it would be difficult to tell which price belongs to which item, for example when an item has missing data.
To solve this I put all the items of a single webpage into the variable "products" and loop over them one product at a time.
About the "append" vs "write" option for the CSV: in my implementation the first thing I check is whether the result.csv file exists.
Then we have two cases:
result.csv doesn't exist: I create it and I put headers in
result.csv already exists: it means that header is already in place and I can simply append new rows when looping
In order to get data out easily I've used BeautifulSoup (install it easily with pip).
There are still several challenges ahead because the data in this webpage is not consistent but the following example should be enough to get you going.
Please keep in mind that the "break" in the code will stop the scraping at the 1st page.
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver
from bs4 import BeautifulSoup
import os.path

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')
#driver = webdriver.Chrome()

def write_csv_header():
    with open('result.csv', 'w') as f:
        f.write("Product Name, Sale Price, Discount, Old Price \n")

def write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link):
    with open('result.csv', 'a') as f:
        f.write(product_title + ' , ' + product_new_price + ' , ' + product_discount + ' , ' + product_old_price + ' , ' + product_link + '\n')

# only create the file and write the header when result.csv does not exist yet
if not os.path.isfile('result.csv'):
    write_csv_header()

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num
    driver.get(url)
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    products = soup.findAll("a", {"class": "single-product"})
    for product in products:
        try:
            product_title = product.find("div", {"class": "product-title"}).text.strip()
        except:
            product_title = "Not available"
        try:
            product_new_price = product.find("span", {"class": "new-price-text"}).text.strip()
        except:
            product_new_price = "Not available"
        try:
            product_old_price = product.find("span", {"class": "old-price-text"}).text.strip()
        except:
            product_old_price = "Not available"
        try:
            product_discount = product.find("div", {"class": "discount"}).text.strip()
        except:
            product_discount = "Not available"
        try:
            product_link = product['href']
        except:
            product_link = "Not available"
        write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link)
    break  # this stops the parsing at the 1st page. I think it is a good idea to check data and fix all discrepancies before proceeding

driver.close()
If you want to append new lines to the file, you must use the "a" mode instead of "w".
with open('result.csv', 'a') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")
definition of "w" option:
Opens a file for writing only. Overwrites the file if the file exists.
If the file does not exist, creates a new file for writing.
definition of "a" option:
Opens a file for appending. The file pointer is at the end of the file
if the file exists. That is, the file is in the append mode. If the
file does not exist, it creates a new file for writing.
definition of "ab" option:
Opens a file for appending in binary format. The file pointer is at
the end of the file if the file exists. That is, the file is in the
append mode. If the file does not exist, it creates a new file for
writing.
Therefore, for appending new lines, you must use options that contain "a" (appending option).
The definitions above are quoted from this answer.
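As a quick illustration of the difference (a minimal sketch using a throwaway demo.txt, not the result.csv from the question):

# "w" truncates on every open, so only the last write survives
with open("demo.txt", "w") as f:
    f.write("first\n")
with open("demo.txt", "w") as f:
    f.write("second\n")
# demo.txt now contains only: second

# "a" keeps what is already there and adds to the end
with open("demo.txt", "a") as f:
    f.write("third\n")
# demo.txt now contains: second, third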
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    #Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if(td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    #Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

#Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you only care about the "Actions" table, I'd locate that table and keep its event rows only. This can be done with the help of a filtering function that checks that the first cell is not empty:
import csv
from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")

data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
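As a small aside on why csv.writer beats manual pipe- or comma-joining (a self-contained illustration, separate from the scraper above): it handles quoting for you when a field itself contains the delimiter, as the player names here do.

import csv
import io

buf = io.StringIO()
csv.writer(buf).writerow(["300978", "60:00", "GK Out", "33. Hudacek, Julius"])
print(buf.getvalue(), end="")
# -> 300978,60:00,GK Out,"33. Hudacek, Julius"
# the comma inside the name is quoted automatically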
I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of pages of news and will record the article titles and their links. I have managed to get that done for the first page; the problem is getting the content of the subsequent pages. By searching on Stack Overflow, I think I managed to find a solution that makes the script access more than one URL, BUT it seems to be overwriting the content extracted from each page it accesses, so I always end up with the same number of recorded articles in the file. Something that might help: I know the URLs follow this model: "/ultimas/?page=1", "/ultimas/?page=2", etc., and the site appears to be using AJAX to request new articles.
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))

letters = soup.find_all("div", class_="titulo-noticia")
letters[0]

lobbying = {}
for element in letters:
    lobbying[element.a.get_text()] = {}

letters[0].a["href"]

prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
    lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

import os, csv
os.chdir("...")

with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

import json
with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
Any help on how I might go about adding the content of each page to the final file would be very appreciated. Thank you!
How about this one, if it serves the same purpose:
import csv, requests
from lxml import html

base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"

outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption","Link"])

for url in [program_url.format(page) for page in range(1, 4)]:
    response = requests.get(url)
    tree = html.fromstring(response.text)
    for title in tree.xpath("//div[@class='noticia']"):
        caption = title.xpath('.//span[@class="field-content"]/a/text()')[0]
        policy = title.xpath('.//span[@class="field-content"]/a/@href')[0]
        writer.writerow([caption, base_url + policy])
It looks like the code in your for loop (for page in range(1, 4):) isn't being called because your file isn't correctly indented.
If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))

    letters = soup.find_all("div", class_="titulo-noticia")

    lobbying = {}
    for element in letters:
        lobbying[element.a.get_text()] = {}

    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

    for item in lobbying.keys():
        print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

    #os.chdir("...")
    with open("lobbying.csv", "w") as toWrite:
        writer = csv.writer(toWrite, delimiter=",")
        writer.writerow(["name", "link",])
        for a in lobbying.keys():
            writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

    with open("lobbying.json", "w") as writeJSON:
        json.dump(lobbying, writeJSON)

print "Fim"
So I have this Python script. Right now, I run the script and it gives me an output file in CSV.
What I want: when it finishes, it should restart and check for changes to those output values (without refreshing the output file when it restarts and erasing all the previously collected data).
As well, it takes about 3 seconds per line of data to be retrieved. Does anyone know how I can make it faster so it can handle large data sets?
import urllib2,re,urllib,urlparse,csv,sys,time,threading,codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div',{'class':'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        #pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))

for i in inputs:
    extract(i[0])

ifile.close()
print("finished")
I'm trying to scrape temperatures from a weather site using the following:
import urllib2
from BeautifulSoup import BeautifulSoup

f = open('airport_temp.tsv', 'w')
f.write("Location" + "\t" + "High Temp (F)" + "\t" + "Low Temp (F)" + "\t" + "Mean Humidity" + "\n" )

# eventually parse from http://www.wunderground.com/history/airport/\w{4}/2012/\d{2}/1/DailyHistory.html
for x in range(10):
    locationstamp = "Location " + str(x)
    print "Getting data for " + locationstamp
    url = 'http://www.wunderground.com/history/airport/KAPA/2013/3/1/DailyHistory.html'
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    location = soup.findAll('h1').text
    locsent = location.split()
    loc = str(locsent[3,6])
    hightemp = soup.findAll('nobr')[6].text
    htemp = hightemp.split()
    ht = str(htemp[1])
    lowtemp = soup.findAll('nobr')[10].text
    ltemp = lowtemp.split()
    lt = str(ltemp[1])
    avghum = soup.findAll('td')[23].text
    f.write(loc + "\t|" + ht + "\t|" + lt + "\t|" + avghum + "\n" )

f.close()
Unfortunately, I get an error saying:
Getting data for Location 0
Traceback (most recent call last):
File "airportweather.py", line 18, in <module>
location = soup.findAll('H1').text
AttributeError: 'list' object has no attribute 'text'
I've looked through BS and Python documentation, but am still pretty green, so I couldn't figure it out. Please help this newbie!
The .findAll() method returns a list of matches. If you want just one result, use the .find() method instead. Alternatively, pick out a specific element the way the rest of the code does, or loop over the results:
location = soup.find('h1').text
or
locations = [el.text for el in soup.findAll('h1')]
or
location = soup.findAll('h1')[2].text
This is quite simple: findAll returns a list, so if you are sure that there is only one element you are interested in, then soup.findAll('H1')[0].text should work.
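For completeness, a tiny self-contained example of the difference (using bs4 and a made-up HTML snippet rather than the weather page):

from bs4 import BeautifulSoup

html = "<h1>Denver, CO</h1><h1>March 1, 2013</h1>"
soup = BeautifulSoup(html, "html.parser")

print(soup.find('h1').text)                     # first match only: Denver, CO
print([el.text for el in soup.findAll('h1')])   # every match, as a list
print(soup.findAll('h1')[0].text)               # same as .find('h1').text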