I'm new to this site and new to Python--as in only a few days into a course. At work, I have inherited a good sized project that involves matching 9 digit zip codes in an excel file to their US congressional districts (from a website). I've noticed through investigation of the code (what little I know) is that the author might be using a website that only allows 5 digit zip codes, not 9 digits. Since some districts share zip codes, 9 digit codes are more precise. Here's the code I'm working with:
import urllib
import re
import csv
import datetime
print datetime.datetime.now()
INPUT_FILE_NAME = 'zip1.csv'
OUTPUT_FILE_NAME = 'legislator_output_%s-%0*d%0*d.csv' % ((datetime.date.today(), 2, datetime.datetime.now().hour, 2, datetime.datetime.now().minute))
print 'file name:', OUTPUT_FILE_NAME
input_file_handler = open(INPUT_FILE_NAME, 'rb')
input_reader = csv.reader(input_file_handler)
output_file_handler = open(OUTPUT_FILE_NAME, 'wb', 1)
output_writer = csv.writer(output_file_handler)
output_writer.writerow(['unique id', 'zip', 'plus 4', 'member url', 'member name', 'member district'])
fail_list = []
counter = 0
for input_line in input_reader:
zip_entry = '%s-%s' % (input_line[1], input_line[2])
unique_id = input_line[0]
counter += 1
#if counter > 25: continue
zip_part = zip_entry.split('-')[0]
plus_four_part = zip_entry.split('-')[1]
params = urllib.urlencode({'ZIP':zip_part, '%2B4':plus_four_part})
f = urllib.urlopen('http://www.house.gov/htbin/zipfind', params)
page_source = f.read()
#print page_source
relevant_section = re.findall(r'templateLanding(.*?)contentMain', page_source, re.DOTALL)
rep_info = re.findall('(.*?)', relevant_section[0])
rep_district_info = re.findall('is located in (.*?)\.', relevant_section[0])
try:
member_url = rep_info[0][0]
member_name = rep_info[0][1]
member_district = rep_district_info[0]
#member_district = rep_info[0][2]
except:
fail_list += [zip_entry]
member_url = ''
member_name = ''
member_district = ''
row_to_write = [unique_id, zip_part, plus_four_part, member_url, member_name, member_district, datetime.datetime.now()]
output_writer.writerow(row_to_write)
if counter % 50 == 0:
print counter, row_to_write
output_file_handler.close() print OUTPUT_FILE_NAME, 'closed at', datetime.datetime.now()
print len(fail_list), 'entries failed to lookup'
print counter, 'rows done at', datetime.datetime.now()
So, the author used site which only allows for five digits (the code is a couple of years old as is this site). I have no idea how to replace it correctly on a new site.
If anyone knows of a solution or can point me in the direction of resources that might help, I would much appreciate it. At the moment I'm lost!
For what I can see, you can query, for example, http://www.house.gov/htbin/findrep?ZIP=63333-1211
So you could replace the urllib call for
urllib.urlopen('http://www.house.gov/htbin/findrep', zip_entry)
Related
I am working on Stock predicting project.I want to download historical data from yahoo finance and save them in CSV format.
Since I am beginner in Python I am unable to correct the error.
My code is as follows:
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time
crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'
def get_crumble_and_cookie(symbol):
link = crumble_link.format(symbol)
response = urllib2.urlopen(link)
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
text = response.read()
match = re.search(crumble_regex, text)
crumble_str = match.group(1)
return crumble_str, cookie_str
def download_quote(symbol, date_from, date_to):
time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
time_stamp_to = calendar.timegm(datetime.datetime.strptime(date_to, "%Y-%m-%d").timetuple())
attempts = 0
while attempts < 5:
crumble_str, cookie_str = get_crumble_and_cookie(symbol)
link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
#print link
r = urllib2.Request(link, headers={'Cookie': cookie_str})
try:
response = urllib2.urlopen(r)
text = response.read()
print "{} downloaded".format(symbol)
return text
except urllib2.URLError:
print "{} failed at attempt # {}".format(symbol, attempts)
attempts += 1
time.sleep(2*attempts)
return ""
if __name__ == '__main__':
print get_crumble_and_cookie('KO')
from_arg = "from"
to_arg = "to"
symbol_arg = "symbol"
output_arg = "o"
opt_list = (from_arg+"=", to_arg+"=", symbol_arg+"=")
try:
options, args = getopt.getopt(sys.argv[1:],output_arg+":",opt_list)
except getopt.GetoptError as err:
print err
for opt, value in options:
if opt[2:] == from_arg:
from_val = value
elif opt[2:] == to_arg:
to_val = value
elif opt[2:] == symbol_arg:
symbol_val = value
elif opt[1:] == output_arg:
output_val = value
print "downloading {}".format(symbol_val)
text = download_quote(symbol_val, from_val, to_val)
with open(output_val, 'wb') as f:
f.write(text)
print "{} written to {}".format(symbol_val, output_val)
And the Error message that I am getting is :
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
49, in <module>
print get_crumble_and_cookie('KO')
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
19, in get_crumble_and_cookie
cookie_str = match.group(1)
AttributeError: 'NoneType' object has no attribute 'group'
So how can we resolve this problem that has popped up?
Look at these two commands:
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
The first one takes the string response.info() does a regular expression search to match cookie_regex. Then match.group(1) is supposed to take the match from it. The problem however is that if you do a print match in between these commands, you'll see that the re.search() returned nothing. This means match.group() has nothing to "group", which is why it errors out.
If you take a closer look at response.info() (you could just add a print response.info() command in your script to see it), you'll see that there's a line in response code that starts with "set-cookie:", the code after which you're trying to capture. However, you have your cookie_regex string set to look for a line with "Set-Cookie:". Note the capital letters. When I change that string to all lower-case, the error goes away:
cookie_regex = r'set-cookie: (.*?); '
I did run into another error after that, where print "downloading {}".format(symbol_val) stops because symbol_val hasn't been defined. It seems that this variable is only declared and assigned when opt[2:] == symbol_arg:. So you may want to rewrite that part to cover all cases.
I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so i figured i'd join and finally become part of this community - starting with a question about a problem that's been plaguing me for hours.
I login to a website and scrape a big body of text within the b tag to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but i need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all separate text blocks in this example would become part of this table, with the inv code repeating with its Bin values. I would post my attempts at parsing this data(have tried Pandas/bs/openpyxl/csv writer), but ill admit they are a little embarrassing, as i cannot find any information on this specific problem. Is there any benevolent soul out there that can help me out? :)
(Also, i am using Python 2.7)
A simple custom parser like the following should do the trick.
from __future__ import print_function
def parse_body(s):
line_sep = '\n'
getting_bins = False
inv_code = ''
for l in s.split(line_sep):
if l.startswith('INVENTORY CODE:') and not getting_bins:
inv_data = l.split()
inv_code = inv_data[2] + '-' + ' '.join(inv_data[3:])
elif l.startswith('INVENTORY CODE:') and getting_bins:
print("unexpected inventory code while reading bins:", l)
elif l.startswith('BIN') and l.endswith('MESSAGE'):
getting_bins = True
elif getting_bins == True and l:
bin_data = l.split()
# need to add exception handling here to make sure:
# 1) we have an inv_code
# 2) bin_data is at least 3 items big (assuming two for
# bin_id and at least one for message)
# 3) maybe some constraint checking to ensure that we have
# a valid instance of an inventory code and bin id
bin_id = ''.join(bin_data[0:2])
message = ' '.join(bin_data[2:])
# we now have a bin, an inv_code, and a message to add to our table
print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
elif getting_bins == True and not l:
# done getting bins for current inventory code
getting_bins = False
inv_code = ''
A rather complex one, but this might get you started:
import re, pandas as pd
from pandas import DataFrame
rx = re.compile(r'''
(?:INVENTORY\ CODE:)\s*
(?P<inv>.+\S)
[\s\S]+?
^BIN.+[\n\r]
(?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)
string = your_string_here
# set up the dataframe
df = DataFrame(columns = ['BIN', 'INV', 'MESSAGE'])
for match in rx.finditer(string):
inv = match.group('inv')
bin_msg_raw = match.group('bin_msg').split("\n")
rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)
for item in bin_msg_raw:
for m in rxbinmsg.finditer(item):
# append it to the dataframe
df.loc[len(df.index)] = [m.group('bin'), inv, m.group('message')]
print(df)
Explanation
It looks for INVENTORY CODE and sets up the groups (inv and bin_msg) for further processing in afterwork() (note: it would be easier if you had only one line of bin/msg as you need to split the group here afterwards).
Afterwards, it splits the bin and msg part and appends all to the df object.
I had a code written for a website scrapping which may help you.
Basically what you need to do is write click on the web page go to html and try to find the tag for the table you are looking for and using the module (i am using beautiful soup) extract the information. I am creating a json as I need to store it into mongodb you can create table.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo
def req_and_parsing():
url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
list1 = ['534UP','534DOWN']
for Route in list1:
final_url = url2 + Route
#r = requests.get(final_url)
#parsing_file(r.text,Route)
outdict = []
outdict = [parsing_file( requests.get(url2+Route).text,Route) for Route in list1 ]
print outdict
conn = f_connection()
for i in range(len(outdict)):
insert_records(conn,outdict[i])
def parsing_file(txt,Route):
soup = BeautifulSoup(txt)
table = soup.findAll("table",{"id" : "ctl00_ContentPlaceHolder1_GridView2"})
#trtags = table[0].findAll('tr')
tdlist = []
trtddict = {}
"""
for trtag in trtags:
print 'print trtag- ' , trtag.text
tdtags = trtag.findAll('td')
for tdtag in tdtags:
print tdtag.text
"""
divtags = soup.findAll("span",{"id":"ctl00_ContentPlaceHolder1_ErrorLabel"})
for divtag in divtags:
for divtag in divtags:
print "div tag - " , divtag.text
if divtag.text == "Currently no bus is running on this route" or "This is not a cluster (orange bus) route":
print "Page not displayed Errored with below meeeage for Route-", Route," , " , divtag.text
sys.exit()
trtags = table[0].findAll('tr')
for trtag in trtags:
tdtags = trtag.findAll('td')
if len(tdtags) == 2:
trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
return trtddict
def sub_colon(tag_str):
return re.sub(';',',',tag_str)
def f_connection():
try:
conn=pymongo.MongoClient()
print "Connected successfully!!!"
except pymongo.errors.ConnectionFailure, e:
print "Could not connect to MongoDB: %s" % e
return conn
def insert_records(conn,stop_dict):
db = conn.test
print db.collection_names()
mycoll = db.stopsETA
mycoll.insert(stop_dict)
if __name__ == "__main__":
req_and_parsing()
Sorry - My questions is how can I change a file object within a function from a different function?
I've been trying to work out this error in my first python script for too long now, Dr Google and the forums aren't helping me too much, but I'm hoping you can.
I have a looping function that generates alot of data and I would like to output it to a text file, and create a new text file after the third loop.
I have 2 functions defined, one to create the data hashes, the other to create the new files.
The new files are being created as expected (aaa.txt, baa.txt...etc) but the "hashit" function only ever writes to the first file (aaa.txt) even though the others are being created.
I have tried fo.close() fo.flush(), as well as referencing fo in the functions but can't seem to make it work. Also I've moved the fo.write from the function to the main body.
I have included a cut down version of the code that I've been using to troubleshoot this issue, the real one has several more loops increasing the string length.
Thanks in advance
import smbpasswd, hashlib
base = '''abcdefghijklmnopqrstuvwxyz '''
# base length 95
print(base)
baselen = len(base)
name = 'aaa.txt'
fo = open(name, "w")
print "Name of the file: ", fo.name
print "Closed or not : ", fo.closed
print "Opening mode : ", fo.mode
print "Softspace flag : ", fo.softspace
pw01 = 0
pw02 = 0
pw03 = 0
def hashit(passwd):
#2
# Need to install module
# sudo apt-get install python-smbpasswd
hex_dig_lm = smbpasswd.lmhash(passwd)
hex_dig_ntlm = smbpasswd.nthash(passwd)
#print '%s:%s' % smbpasswd.hash(passwd)
hash_md5 = hashlib.md5(passwd)
hex_dig_md5 = hash_md5.hexdigest()
print(passwd)
print(hex_dig_lm)
print(hex_dig_ntlm)
print(hex_dig_md5)
hashstring = passwd +","+ hex_dig_lm +","+ hex_dig_md5 + '\n'
fo.write(hashstring);
def newfile(name):
fo.flush()
fo = open(name, "a")
print("-------newfile------")
print "Name of the file: ", fo.name
print "Closed or not : ", fo.closed
print('NewFile : ' + name)
raw_input("\n\nPress the enter key to exit.")
# add 3rd digit
while (pw03 < baselen):
pwc03 = base[pw03]
name = pwc03 + 'aa.txt'
fo.close
newfile(name);
pw03 += 1
while (pw02 < baselen):
pwc02 = base[pw02]
pw02 += 1
while (pw01 < baselen):
pwc01 = base[pw01]
pw01 += 1
passwd = pwc03 + pwc02 + pwc01
hashit(passwd);
else:
pw01 = 0
else:
pw02 = 0
else:
pw03 = 0
In your newfile() function, add this line first:
global fo
I'm trying to grab the most recently uploaded videos. There's a standard feed for that - it's called most_recent. I don't have any problems grabbing the feed, but when I look at the entries inside, they're all half a year old, which is hardly recent.
Here's the code I'm using:
import requests
import os.path as P
import sys
from lxml import etree
import datetime
namespaces = {"a": "http://www.w3.org/2005/Atom", "yt": "http://gdata.youtube.com/schemas/2007"}
fmt = "%Y-%m-%dT%H:%M:%S.000Z"
class VideoEntry:
"""Data holder for the video."""
def __init__(self, node):
self.entry_id = node.find("./a:id", namespaces=namespaces).text
published = node.find("./a:published", namespaces=namespaces).text
self.published = datetime.datetime.strptime(published, fmt)
def __str__(self):
return "VideoEntry[id='%s']" % self.entry_id
def paginate(xml):
root = etree.fromstring(xml)
next_page = root.find("./a:link[#rel='next']", namespaces=namespaces)
if next_page == None:
next_link = None
else:
next_link = next_page.get("href")
entries = [VideoEntry(e) for e in root.xpath("/a:feed/a:entry", namespaces=namespaces)]
return entries, next_link
prefix = "https://gdata.youtube.com/feeds/api/standardfeeds/"
standard_feeds = set("top_rated top_favorites most_shared most_popular most_recent most_discussed most_responded recently_featured on_the_web most_viewed".split(" "))
feed_name = sys.argv[1]
assert feed_name in standard_feeds
feed_url = prefix + feed_name
all_video_ids = []
while feed_url is not None:
r = requests.get(feed_url)
if r.status_code != 200:
break
text = r.text.encode("utf-8")
video_ids, feed_url = paginate(text)
all_video_ids += video_ids
all_upload_times = [e.published for e in all_video_ids]
print min(all_upload_times), max(all_upload_times)
As you can see, it prints the min and max timestamps for the entire feed.
misha#misha-antec$ python get_standard_feed.py most_recent
2013-02-02 14:40:02 2013-02-02 14:54:00
misha#misha-antec$ python get_standard_feed.py top_rated
2006-04-06 21:30:53 2013-07-28 22:22:38
I've glanced through the downloaded XML and it appears to match the output. Am I doing something wrong?
Also, on an unrelated note, the feeds I'm getting are all about 100 entries (I'm paginating through them 25 at a time). Is this normal? I expected the feeds to be a bit bigger.
Regarding the "Most-Recent-Feed"-Topic: There is a ticket for this one here. Unfortunately, the YouTube-API-Teams doesn't respond or solved the problem so far.
Regarding the number of entries: That depends on the type of standardfeed, but for the most-recent-Feed it´s usually around 100.
Note: You could try using the "orderby=published" parameter to get recents videos, although I don´t know how "recent" they are.
https://gdata.youtube.com/feeds/api/videos?orderby=published&prettyprint=True
You can combine this query with the "category"-parameter or other ones (region-specific queries - like for the standard feeds - are not possible, afaik).
#martineau I have updated my codes, is this what you meant ? How do i handle KeyError instead of NameError ?
url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
web_soup = soup(urllib2.urlopen(url))
table = web_soup.find(name="div", attrs={'class': 'c1'}).find_all(name="div")[4].find_all('table')[0]
data = {}
cur_time = datetime.datetime.strptime("12AM", "%I%p")
for tr_index, tr in enumerate(table.find_all('tr')):
if 'Time' in tr.text:
continue
for td_index, td in enumerate(tr.find_all('td')):
if not td_index:
continue
data[cur_time] = td.text.strip()
if td.find('strong'):
bold_time = cur_time
data[bold_time] = '20'
cur_time += datetime.timedelta(hours=1)
default_value = '20' # whatever you want it to be
try:
bold = data[bold_time]
except NameError:
bold_time = beforebold = beforebeforebold = default_value
# might want to set "bold" to something, too, if needed
else:
beforebold = data.get(bold_time - datetime.timedelta(hours=1))
beforebeforebold = data.get(bold_time - datetime.timedelta(hours=2))
This is where I print my data to do calculation.
print bold
print beforebold
print beforebeforebold
You need to add something to set data[bold_time]:
if td.find('strong'):
bold_time = cur_time
data[bold_time] = ????? # whatever it should be
cur_time += datetime.timedelta(hours=1)
This should avoid both the NameError and KeyError exceptions as long as the word strong is found. You still might want to code defensively and handle one or both of them gracefully. That what exception where meant to do, handle those exceptional cases that shouldn't happen...
I had read your previous post before it disappeared, and then I've read this one.
I find it a pity to use BeautifulSoup for your goal, because, from the code I see, I find its use complicated, and the fact is that regexes run roughly 10 times faster than BeautifulSoup.
Here's the code with only re, that furnishes the data you are interested in.
I know, there will people to say that HTML text can't be parsed by regexs. I know, I know... but I don't parse the text, I directly find the chunks of text that are interesting. The source code of the webpage of this site is apparently very well structured and it seems there is little risk of bugs. Moreover, tests and verification can be added to keep watch on the source code and to be instantly informed on the possible changings made by the webmaster in the webpage
import re
from httplib import HTTPConnection
hypr = HTTPConnection(host='app2.nea.gov.sg',
timeout = 300)
rekete = ('/anti-pollution-radiation-protection/'
'air-pollution/psi/'
'psi-readings-over-the-last-24-hours')
hypr.request('GET',rekete)
page = hypr.getresponse().read()
patime = ('PSI Readings.+?'
'width="\d+%" align="center">\r\n'
' *<strong>Time</strong>\r\n'
' *</td>\r\n'
'((?: *<td width="\d+%" align="center">'
'<strong>\d+AM</strong>\r\n'
' *</td>\r\n)+.+?)'
'width="\d+%" align="center">\r\n'
' *<strong>Time</strong>\r\n'
' *</td>\r\n'
'((?: *<td width="\d+%" align="center">'
'<strong>\d+PM</strong>\r\n'
' *</td>\r\n)+.+?)'
'PM2.5 Concentration')
rgxtime = re.compile(patime,re.DOTALL)
patline = ('<td align="center">\r\n'
' *<strong>' # next line = group 1
'(North|South|East|West|Central|Overall Singapore)'
'</strong>\r\n'
' *</td>\r\n'
'((?: *<td align="center">\r\n' # group 2 start
' *[.\d-]+\r\n' #
' *</td>\r\n)*)' # group 2 end
' *<td align="center">\r\n'
' *<strong style[^>]+>'
'([.\d-]+)' # group 3
'</strong>\r\n'
' *</td>\r\n')
rgxline = re.compile(patline)
rgxnb = re.compile('<td align="center">\r\n'
' *([.\d-]+)\r\n'
' *</td>\r\n')
m= rgxtime.search(page)
a,b = m.span(1) # m.group(1) contains the data AM
d = dict((mat.group(1),
rgxnb.findall(mat.group(2))+[mat.group(3)])
for mat in rgxline.finditer(page[a:b]))
a,b = m.span(2) # m.group(2) contains the data PM
for mat in rgxline.finditer(page[a:b]):
d[mat.group(1)].extend(rgxnb.findall(mat.group(2))+[mat.group(3)])
print 'last 3 values'
for k,v in d.iteritems():
print '%s : %s' % (k,v[-3:])