#martineau I have updated my code, is this what you meant? How do I handle KeyError instead of NameError?
url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
web_soup = soup(urllib2.urlopen(url))
table = web_soup.find(name="div", attrs={'class': 'c1'}).find_all(name="div")[4].find_all('table')[0]
data = {}
cur_time = datetime.datetime.strptime("12AM", "%I%p")
for tr_index, tr in enumerate(table.find_all('tr')):
if 'Time' in tr.text:
continue
for td_index, td in enumerate(tr.find_all('td')):
if not td_index:
continue
data[cur_time] = td.text.strip()
if td.find('strong'):
bold_time = cur_time
data[bold_time] = '20'
cur_time += datetime.timedelta(hours=1)
default_value = '20'  # whatever you want it to be
try:
    bold = data[bold_time]
except NameError:
    bold_time = beforebold = beforebeforebold = default_value
    # might want to set "bold" to something, too, if needed
else:
    beforebold = data.get(bold_time - datetime.timedelta(hours=1))
    beforebeforebold = data.get(bold_time - datetime.timedelta(hours=2))
This is where I print my data to do the calculation:
print bold
print beforebold
print beforebeforebold
You need to add something to set data[bold_time]:
if td.find('strong'):
    bold_time = cur_time
    data[bold_time] = ?????  # whatever it should be
cur_time += datetime.timedelta(hours=1)
This should avoid both the NameError and KeyError exceptions as long as the word strong is found. You still might want to code defensively and handle one or both of them gracefully. That's what exceptions are meant to do: handle those exceptional cases that shouldn't happen...
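If you do want to stay defensive, you can catch both exceptions in one clause and fall back to a default. A minimal standalone sketch (the default value is just a placeholder, and the empty dict stands in for the one built by the scraping loop):

import datetime

data = {}             # stands in for the dict built in the scraping loop
default_value = '20'  # placeholder default, pick whatever makes sense
try:
    bold = data[bold_time]          # NameError if bold_time was never set,
                                    # KeyError if the key was never stored
except (NameError, KeyError):
    bold = beforebold = beforebeforebold = default_value
else:
    beforebold = data.get(bold_time - datetime.timedelta(hours=1), default_value)
    beforebeforebold = data.get(bold_time - datetime.timedelta(hours=2), default_value)
print bold, beforebold, beforebeforebold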
I had read your previous post before it disappeared, and then I've read this one.
I find it a pity to use BeautifulSoup for your goal because, from the code I see, its use here is complicated, and the fact is that regexes run roughly 10 times faster than BeautifulSoup.
Here's the code with only re, that furnishes the data you are interested in.
I know, there will be people who say that HTML text can't be parsed with regexes. I know, I know... but I don't parse the text, I directly find the chunks of text that are interesting. The source code of this site's webpage is apparently very well structured and there seems to be little risk of bugs. Moreover, tests and verification can be added to keep watch on the source code and to be instantly informed of any possible changes made by the webmaster to the webpage.
import re
from httplib import HTTPConnection

hypr = HTTPConnection(host='app2.nea.gov.sg',
                      timeout=300)
rekete = ('/anti-pollution-radiation-protection/'
          'air-pollution/psi/'
          'psi-readings-over-the-last-24-hours')
hypr.request('GET', rekete)
page = hypr.getresponse().read()
patime = ('PSI Readings.+?'
          'width="\d+%" align="center">\r\n'
          ' *<strong>Time</strong>\r\n'
          ' *</td>\r\n'
          '((?: *<td width="\d+%" align="center">'
          '<strong>\d+AM</strong>\r\n'
          ' *</td>\r\n)+.+?)'
          'width="\d+%" align="center">\r\n'
          ' *<strong>Time</strong>\r\n'
          ' *</td>\r\n'
          '((?: *<td width="\d+%" align="center">'
          '<strong>\d+PM</strong>\r\n'
          ' *</td>\r\n)+.+?)'
          'PM2.5 Concentration')
rgxtime = re.compile(patime,re.DOTALL)
patline = ('<td align="center">\r\n'
           ' *<strong>'                       # next line = group 1
           '(North|South|East|West|Central|Overall Singapore)'
           '</strong>\r\n'
           ' *</td>\r\n'
           '((?: *<td align="center">\r\n'    # group 2 start
           ' *[.\d-]+\r\n'                    #
           ' *</td>\r\n)*)'                   # group 2 end
           ' *<td align="center">\r\n'
           ' *<strong style[^>]+>'
           '([.\d-]+)'                        # group 3
           '</strong>\r\n'
           ' *</td>\r\n')
rgxline = re.compile(patline)
rgxnb = re.compile('<td align="center">\r\n'
                   ' *([.\d-]+)\r\n'
                   ' *</td>\r\n')
m = rgxtime.search(page)
a, b = m.span(1)  # m.group(1) contains the data AM
d = dict((mat.group(1),
          rgxnb.findall(mat.group(2)) + [mat.group(3)])
         for mat in rgxline.finditer(page[a:b]))
a, b = m.span(2)  # m.group(2) contains the data PM
for mat in rgxline.finditer(page[a:b]):
    d[mat.group(1)].extend(rgxnb.findall(mat.group(2)) + [mat.group(3)])
print 'last 3 values'
for k, v in d.iteritems():
    print '%s : %s' % (k, v[-3:])
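As an example of the "tests and verification" mentioned above, a couple of cheap sanity checks can be added so that a layout change by the webmaster fails loudly instead of producing silent garbage. A minimal sketch (the expected region names are an assumption based on the output above):

# right after "m = rgxtime.search(page)" you could add:
if m is None:
    raise ValueError('page layout changed: rgxtime no longer matches')

# and once the dict is built, verify that the expected regions are all present
expected = set(['North', 'South', 'East', 'West', 'Central', 'Overall Singapore'])
if set(d) != expected:
    raise ValueError('unexpected region set: %r' % sorted(d))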
So far I have this code, which works except that the WHERE clause has no effect. Whatever I pass in the WHERE clause, it is not considered:
import win32com.client

a = win32com.client.Dispatch("Access.Application")
path = r'C:\Users\Egon\Documents\Kassenbuch_py\Pro Version\kassenbuch.accdb'
filename = r'C:\Users\Egon\Documents\Kassenbuch_py\Pro Version\pdf\kassenbuch.pdf'
# db = a.CloseCurrentDatabase()
db = a.OpenCurrentDatabase(path)
ReportName = 'kassenbuch'
try:
    StartDate = '#2021/06/13#'
    EndDate = '#2021/06/14#'
    acView = 'acViewPreview'
    where_cl = '[DATUM] BETWEEN ' + StartDate + ' AND ' + EndDate
    # where_cl = '"[EINNAHME] = 200"'
    a.DoCmd.OpenReport(ReportName, 2, where_cl, 3)
    # a.DoCmd.OpenReport(ReportName, 2, where_cl, 3)
    a.DoCmd.Save(3, ReportName)
    # a.visible = 1
    a.DoCmd.OutputTo(3, ReportName, r'PDF Format (*.pdf)', filename)
except Exception as e:
    QMessageBox.warning(self, 'Fehler', str(e), QMessageBox.Ok)
a.DoCmd.CloseDatabase
a.Quit()
a = None
Well, that's because you're passing a WHERE condition as the third argument, the filter name. Review the docs.
If you pass an empty string there and pass your WHERE condition as the WhereCondition argument, it'll work:
a.DoCmd.OpenReport(ReportName, 2, '', where_cl ,3)
Note that the DoCmd.Save should not really be there, it does nothing since the design of the report wasn't changed.
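For reference, the whole corrected sequence might look like the sketch below, reusing the variables from the question; as an option, the DoCmd.Save can be replaced by DoCmd.Close (object type acReport is 3), since the report's design is not being changed:

# sketch: WhereCondition is the 4th argument of OpenReport; FilterName (3rd) stays empty
a.DoCmd.OpenReport(ReportName, 2, '', where_cl, 3)
a.DoCmd.OutputTo(3, ReportName, r'PDF Format (*.pdf)', filename)
a.DoCmd.Close(3, ReportName)   # 3 = acReport; close the report instead of "saving" it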
Hi everyone this is my first time here, and I am a beginner in Python. I am in the middle of writing a program that returns a txt document containing information about a stock (Watchlist Info.txt), based on the input of another txt document containing the company names (Watchlist).
To achieve this, I have written 3 functions, of which 2 functions reuters_ticker() and stock_price() are completed as shown below:
import re  # needed for re.search below

def reuters_ticker(desired_search):
    # from company name, execute a Google search and return the Reuters stock ticker
    try:
        from googlesearch import search
    except ImportError:
        print('No module named google found')
    query = desired_search + ' reuters'
    for j in search(query, tld="com.sg", num=1, stop=1, pause=2):
        result = j
    ticker = re.search(r'\w+\.\w+$', result)
    return ticker.group()
Stock Price:
import pandas as pd  # needed for read_html/concat below

def stock_price(company, doc=None):
    ticker = reuters_ticker(company)
    request = 'https://www.reuters.com/companies/' + ticker
    raw_main = pd.read_html(request)
    data1 = raw_main[0]
    data1.set_index(0, inplace=True)
    data1 = data1.transpose()
    data2 = raw_main[1]
    data2.set_index(0, inplace=True)
    data2 = data2.transpose()
    stock_info = pd.concat([data1, data2], axis=1)
    if doc == None:
        print(company + '\n')
        print('Previous Close: ' + str(stock_info['Previous Close'][1]))
        print('Forward PE: ' + str(stock_info['Forward P/E'][1]))
        print('Div Yield(%): ' + str(stock_info['Dividend (Yield %)'][1]))
    else:
        from datetime import date
        with open(doc, 'a') as output:
            output.write(date.today().strftime('%d/%m/%y') + '\t' + str(stock_info['Previous Close'][1]) + '\t' + str(stock_info['Forward P/E'][1]) + '\t' + '\t' + str(stock_info['Dividend (Yield %)'][1]) + '\n')
            output.close()
The 3rd function, watchlist_report(), is where I am getting problems with writing the information in the format as desired.
def watchlist_report(watchlist):
    with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
        searches = companies.read()
        x = searches.split('\n')
        for i in x:
            output.write(i + ':\n')
            stock_price(i, doc='Watchlist Info.txt')
            output.write('\n')
When I run watchlist_report('Watchlist.txt'), where Watchlist.txt contains 'Apple' and 'Facebook' each on new lines, my output is this:
26/04/20 275.03 22.26 1.12
26/04/20 185.13 24.72 --
Apple:
Facebook:
Instead of what I want and would expect based on the code I have written in watchlist_report():
Apple:
26/04/20 275.03 22.26 1.12
Facebook:
26/04/20 185.13 24.72 --
Therefore, my questions are:
1) Why is my output formatted this way?
2) Which part of my code do I have to change to make the written output in my desired format?
Any other suggestions about how I can clean my code and any libraries I can use to make my code nicer are also appreciated!
You are handling two different file handles - the file handle opened inside stock_price gets closed earlier, so its content is written first, before the file handle opened in watchlist_report gets closed, flushed and written.
Instead of creating a new open(..) in your function, pass the current file handle:
def watchlist_report(watchlist):
    with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
        searches = companies.read()
        x = searches.split('\n')
        for i in x:
            output.write(i + ':\n')
            stock_price(i, doc=output)  # pass the file handle
            output.write('\n')
Inside def stock_price(company, doc=None): use the provided filehandle:
def stock_price(company, output=None):  # changed name here
    # [snip] - removed unrelated code for this answer for brevity's sake
    if output is None:  # check for None using IS
        print( ... )  # print whatever you like here
    else:
        from datetime import date
        output.write( .... )  # write whatever you want it to write
        # output.close()  # do not close, the outer function does this
Do not close the file handle in the inner function; the context handler with(..) of the outer function does that for you.
The main takeaway for file handling is that things you write(..) to your file are not necessarily placed there immediately. The file handler chooses when to actually persist data to your disk; the latest it does that is when it goes out of scope (of the context handler) or when its internal buffer reaches some threshold so it "thinks" it is now prudent to alter the data on your disk. See How often does python flush to a file? for more info.
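For completeness: if you ever do need to keep two separate handles on the same file, you can push the buffered data out yourself with flush(). A minimal standalone sketch:

with open('Watchlist Info.txt', 'a') as output:
    output.write('Apple:\n')
    output.flush()  # force the buffered header onto disk before anything
                    # else (e.g. another handle) appends to the same file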
I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so I figured I'd join and finally become part of this community - starting with a question about a problem that's been plaguing me for hours.
I log in to a website and scrape a big body of text within the b tag, to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but I need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all separate text blocks in this example would become part of this table, with the inv code repeating with its Bin values. I would post my attempts at parsing this data (I have tried Pandas/bs/openpyxl/csv writer), but I'll admit they are a little embarrassing, as I cannot find any information on this specific problem. Is there any benevolent soul out there that can help me out? :)
(Also, I am using Python 2.7.)
A simple custom parser like the following should do the trick.
from __future__ import print_function

def parse_body(s):
    line_sep = '\n'
    getting_bins = False
    inv_code = ''
    for l in s.split(line_sep):
        if l.startswith('INVENTORY CODE:') and not getting_bins:
            inv_data = l.split()
            inv_code = inv_data[2] + '-' + ' '.join(inv_data[3:])
        elif l.startswith('INVENTORY CODE:') and getting_bins:
            print("unexpected inventory code while reading bins:", l)
        elif l.startswith('BIN') and l.endswith('STATUS'):
            # header line of each block; the sample shows "BIN   STATUS"
            getting_bins = True
        elif getting_bins and l:
            bin_data = l.split()
            # need to add exception handling here to make sure:
            # 1) we have an inv_code
            # 2) bin_data is at least 3 items big (assuming two for
            #    bin_id and at least one for message)
            # 3) maybe some constraint checking to ensure that we have
            #    a valid instance of an inventory code and bin id
            bin_id = ''.join(bin_data[0:2])
            message = ' '.join(bin_data[2:])
            # we now have a bin, an inv_code, and a message to add to our table
            print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
        elif getting_bins and not l:
            # done getting bins for current inventory code
            getting_bins = False
            inv_code = ''
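A usage sketch, assuming the scraped text has been saved to Output.txt as described in the question:

# feed the saved scrape through the parser above
with open('Output.txt') as fh:
    parse_body(fh.read())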
A rather complex one, but this might get you started:
import re
import pandas as pd
from pandas import DataFrame

rx = re.compile(r'''
    (?:INVENTORY\ CODE:)\s*
    (?P<inv>.+\S)
    [\s\S]+?
    ^BIN.+[\n\r]
    (?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
    ''', re.MULTILINE | re.VERBOSE)

string = your_string_here

# set up the dataframe
df = DataFrame(columns=['BIN', 'INV', 'MESSAGE'])

for match in rx.finditer(string):
    inv = match.group('inv')
    bin_msg_raw = match.group('bin_msg').split("\n")
    rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)
    for item in bin_msg_raw:
        for m in rxbinmsg.finditer(item):
            # append it to the dataframe
            df.loc[len(df.index)] = [m.group('bin'), inv, m.group('message')]

print(df)
Explanation
It looks for INVENTORY CODE and sets up the groups (inv and bin_msg) for further processing afterwards (note: it would be easier if you had only one line of bin/msg, as you need to split the group here afterwards).
Afterwards, it splits the bin and msg part and appends all to the df object.
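As a side note on the design: appending row by row via df.loc[len(df.index)] is relatively slow for many rows; collecting the rows first and creating the DataFrame once is usually cheaper. A sketch of that variant, reusing rx and string from above and compiling the inner pattern only once:

rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)

rows = []
for match in rx.finditer(string):
    inv = match.group('inv')
    for item in match.group('bin_msg').split("\n"):
        for m in rxbinmsg.finditer(item):
            rows.append({'BIN': m.group('bin'), 'INV': inv, 'MESSAGE': m.group('message')})

df = DataFrame(rows, columns=['BIN', 'INV', 'MESSAGE'])
print(df)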
I had code written for website scraping which may help you.
Basically what you need to do is right-click on the web page, go to the HTML source, and try to find the tag for the table you are looking for, then extract the information using a module (I am using Beautiful Soup). I am creating JSON as I need to store it into MongoDB; you can create a table instead.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo

def req_and_parsing():
    url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
    list1 = ['534UP', '534DOWN']
    for Route in list1:
        final_url = url2 + Route
        #r = requests.get(final_url)
        #parsing_file(r.text,Route)
    outdict = []
    outdict = [parsing_file(requests.get(url2 + Route).text, Route) for Route in list1]
    print outdict
    conn = f_connection()
    for i in range(len(outdict)):
        insert_records(conn, outdict[i])

def parsing_file(txt, Route):
    soup = BeautifulSoup(txt)
    table = soup.findAll("table", {"id": "ctl00_ContentPlaceHolder1_GridView2"})
    #trtags = table[0].findAll('tr')
    tdlist = []
    trtddict = {}
    """
    for trtag in trtags:
        print 'print trtag- ', trtag.text
        tdtags = trtag.findAll('td')
        for tdtag in tdtags:
            print tdtag.text
    """
    divtags = soup.findAll("span", {"id": "ctl00_ContentPlaceHolder1_ErrorLabel"})
    for divtag in divtags:
        print "div tag - ", divtag.text
        # compare against each message explicitly; "x == a or b" is always true
        if divtag.text in ("Currently no bus is running on this route",
                           "This is not a cluster (orange bus) route"):
            print "Page not displayed. Errored with below message for Route-", Route, " , ", divtag.text
            sys.exit()
    trtags = table[0].findAll('tr')
    for trtag in trtags:
        tdtags = trtag.findAll('td')
        if len(tdtags) == 2:
            trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
    return trtddict

def sub_colon(tag_str):
    return re.sub(';', ',', tag_str)

def f_connection():
    try:
        conn = pymongo.MongoClient()
        print "Connected successfully!!!"
    except pymongo.errors.ConnectionFailure, e:
        print "Could not connect to MongoDB: %s" % e
    return conn

def insert_records(conn, stop_dict):
    db = conn.test
    print db.collection_names()
    mycoll = db.stopsETA
    mycoll.insert(stop_dict)

if __name__ == "__main__":
    req_and_parsing()
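If you would rather end up with a flat table (e.g. a CSV) than a MongoDB collection, the dict returned by parsing_file() can be dumped with the csv module; a minimal sketch (the file name and column names are just guesses):

import csv

def write_table(stop_dict, path='stops_eta.csv'):
    # sketch: one row per key/value pair from the dict built in parsing_file()
    with open(path, 'wb') as fh:  # 'wb' because this is Python 2
        writer = csv.writer(fh)
        writer.writerow(['stop', 'eta'])
        for stop, eta in stop_dict.iteritems():
            writer.writerow([stop.encode('utf-8'), eta.encode('utf-8')])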
Background information: I'm looking to pull data from ratemyprofessor.com - I have limited programming experience so I decided to see if something was pre-built to accomplish this task.
I came across this here: https://classic.scraperwiki.com/scrapers/ratemyprofessors/
Which is exactly what I'm looking for. ScraperWiki closed down but has it setup to transfer everything to Morph.io - Which I did here: https://morph.io/reddyfire/ratemyprofessors
My problem: It doesn't work. It should be outputting a database that gives me information I've identified as needing. I'm assuming it has something to do with the URL it's pulling from:
response = scraperwiki.scrape("http://www.ratemyprofessors.com/SelectTeacher.jsp?sid=%s&pageNo=%s" % (sid,str(i)))
But I have no idea if that's right. I'm feeling pretty defeated by this but I want to keep going for a solution.
What I need: I'm looking to get the Name, Department, Total Ratings, Overall Quality, Easiness, and Hotness rating for each instructor at the colleges. Here's some sample output in the desired format:
{"953":("Stanford",32),"799":("Rice",17),"780":("Princeton",16)}
I tested a simplified scraper for you. Do note that it isn't Pythonic (i.e. not beautiful or fast), but as a starting point it works.
__author__ = 'Victor'
import urllib
import re

url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=306975'

def crawlURL(addedURL):
    url = addedURL
    html = urllib.urlopen(url).read()
    teacherData = re.findall(r'\">(.*?)</', html)
    output = ''
    addStuff = 0
    for x in xrange(len(teacherData)):
        if teacherData[x] == 'Submit a Correction':
            output = 'professor: '
            for y in xrange(4):
                output += teacherData[x-8+y] + ' '
            addStuff = 1
        elif teacherData[x] == 'Helpfulness' and addStuff == 1:
            output += ': Overall quality: ' + str(teacherData[x-2]) + ': Average grade: ' + str(teacherData[x-1]) + ': Helpfulness: ' + teacherData[x+1]
        elif teacherData[x] == 'Easiness' and addStuff == 1:
            output += ': Easiness: ' + str(teacherData[x+1])
            addStuff = 0
            break
    print output

crawlURL(url)
It renders this output:
Dr. Kimora John Jay College : Overall quality: 5.0: Average grade:
A: Helpfulness: 5.0 : Easiness: 4.6
There is plenty of room for improvement, but this is as close to pseudocode as I could get.
In this example it is a function that prints the output; if you want to add it to a list, just add a "return output" at the end and call the function with "listName.append(crawlURL(url))".
This is for Python 2.7.
And yes, it doesn't get the exact data you requested. It just opens the door for you ;)
EDIT:
Here is an example of how to loop the requests:
def crawlURL(addedURL):
    ...
    return output

baseURL = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=306'
for x in xrange(50):
    url = baseURL + str(x+110)
    if crawlURL(url) != '': print crawlURL(url)
If you are iterating through all of their data you should consider adding delays every now and then so you don't accidentally DDoS them.
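A small way to do that, calling crawlURL() once per URL and pausing between requests (a sketch; the 2-second delay is arbitrary):

import time

for x in xrange(50):
    url = baseURL + str(x + 110)
    result = crawlURL(url)  # call once and reuse the result
    if result != '':
        print result
    time.sleep(2)           # be polite: pause between requests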
I'm new to this site and new to Python - as in only a few days into a course. At work, I have inherited a good-sized project that involves matching 9-digit zip codes in an Excel file to their US congressional districts (from a website). I've noticed through investigation of the code (what little I know of it) that the author might be using a website that only allows 5-digit zip codes, not 9 digits. Since some districts share zip codes, 9-digit codes are more precise. Here's the code I'm working with:
import urllib
import re
import csv
import datetime

print datetime.datetime.now()

INPUT_FILE_NAME = 'zip1.csv'
OUTPUT_FILE_NAME = 'legislator_output_%s-%0*d%0*d.csv' % ((datetime.date.today(), 2, datetime.datetime.now().hour, 2, datetime.datetime.now().minute))
print 'file name:', OUTPUT_FILE_NAME

input_file_handler = open(INPUT_FILE_NAME, 'rb')
input_reader = csv.reader(input_file_handler)
output_file_handler = open(OUTPUT_FILE_NAME, 'wb', 1)
output_writer = csv.writer(output_file_handler)
output_writer.writerow(['unique id', 'zip', 'plus 4', 'member url', 'member name', 'member district'])

fail_list = []
counter = 0
for input_line in input_reader:
    zip_entry = '%s-%s' % (input_line[1], input_line[2])
    unique_id = input_line[0]
    counter += 1
    #if counter > 25: continue
    zip_part = zip_entry.split('-')[0]
    plus_four_part = zip_entry.split('-')[1]
    params = urllib.urlencode({'ZIP': zip_part, '%2B4': plus_four_part})
    f = urllib.urlopen('http://www.house.gov/htbin/zipfind', params)
    page_source = f.read()
    #print page_source
    relevant_section = re.findall(r'templateLanding(.*?)contentMain', page_source, re.DOTALL)
    rep_info = re.findall('(.*?)', relevant_section[0])
    rep_district_info = re.findall('is located in (.*?)\.', relevant_section[0])
    try:
        member_url = rep_info[0][0]
        member_name = rep_info[0][1]
        member_district = rep_district_info[0]
        #member_district = rep_info[0][2]
    except:
        fail_list += [zip_entry]
        member_url = ''
        member_name = ''
        member_district = ''
    row_to_write = [unique_id, zip_part, plus_four_part, member_url, member_name, member_district, datetime.datetime.now()]
    output_writer.writerow(row_to_write)
    if counter % 50 == 0:
        print counter, row_to_write

output_file_handler.close()
print OUTPUT_FILE_NAME, 'closed at', datetime.datetime.now()
print len(fail_list), 'entries failed to lookup'
print counter, 'rows done at', datetime.datetime.now()
So the author used a site which only allows for five digits (the code is a couple of years old, as is this site). I have no idea how to replace it correctly with a new site.
If anyone knows of a solution or can point me in the direction of resources that might help, I would much appreciate it. At the moment I'm lost!
From what I can see, you can query, for example, http://www.house.gov/htbin/findrep?ZIP=63333-1211
So you could replace the urllib call with
urllib.urlopen('http://www.house.gov/htbin/findrep', zip_entry)
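A sketch of how that could slot into the existing loop, building the query string with urlencode (the ZIP parameter name is taken from the example URL above):

params = urllib.urlencode({'ZIP': '%s-%s' % (zip_part, plus_four_part)})
f = urllib.urlopen('http://www.house.gov/htbin/findrep?' + params)
page_source = f.read()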