Using re.sub and getting error - python

I'm trying to use re.sub on a URL, but when I do I get an error:
expected string or buffer
This is the code:
elif used_prefix and cmd == "cats" and self.getAccess(user) >=1 and len(args) == 0:
try:
url = "http://catfacts-api.appspot.com/api/facts"
f = urllib.request.urlopen(url)
data = json.loads(f.readall().decode("utf-8"))["facts"]
data = re.sub(r'\<.*?\>',"",data).replace("\\","")
room.message("Random Cat Fact: %s" % data)
except:
room.message((str(sys.exc_info()[1])))
print(traceback.format_exc())
It's supposed to remove the [" and "] around the words, and to remove the \" around the words.

json.loads(f.readall().decode("utf-8"))["facts"] is a list containing a string, not a string, which is why re.sub complains.
Replace following line:
data = json.loads(f.readall().decode("utf-8"))["facts"]
with:
data = json.loads(f.readall().decode("utf-8"))["facts"][0]
BTW, you don't need to escape <, >.

Related

How to escape all HTML entities in show_popup() method and fix Parse Error in Sublime Text 3 plugin?

I am making a plugin for Sublime Text 3. It contacts my server in Java and receives a response in the form of a list of strings, that contains C code.
To display this code in a popup window you need to pass a string in HTML format to the method show_popup. Accordingly, all C-code characters that can be recognized by the parser as HTML entities should be replaced with their names (&name;) or numbers (&#number;). At first, I just replaced the most common characters with replace(), but it didn't always work out - Parse Error was displayed in the console:
Parse Error: <br> printf ("Decimals: %d %ld\n", 1977, 650000L);
<br> printf ("Preceding with blanks:&nbs
...
y</a></li><p><b>____________________________________________________</b></p>
</ul>
</body>
code: Unexpected character
I've tried to escape html entities with Python's html library:
import html
...
html.escape(string)
But Sublime doesn't seem to see the import and prints in the console that I am using a function without defining it - I guess it didn't notice that I had imported this library (why?). cgi.escape is deprecated, so I can't use that. I decided to write the function myself.
Then I saw a very interesting way to replace all the characters whose code is >127 and some other characters (&, <,>) with their numbers:
def escape_html(s):
    """Escape *s* for HTML display using numeric character references.

    Every character whose code point is above 127, as well as the two
    quote characters, ``<``, ``>`` and ``&``, is replaced by
    ``&#<code point>;``. Afterwards each space becomes ``&nbsp;`` and
    each newline becomes ``<br>``.

    Returns the escaped string.
    """
    special = ('"', '\'', '<', '>', '&')
    pieces = []
    for c in s:
        number = ord(c)
        if number > 127 or c in special:
            pieces.append("&#" + str(number) + ";")
        else:
            pieces.append(c)
    out = "".join(pieces)
    # The numeric references above contain neither spaces nor newlines,
    # so these replacements only touch whitespace that came from the input.
    out = out.replace(" ", "&nbsp;")
    out = out.replace("\n", "<br>")
    return out
This code works perfectly for displaying characters in a browser, but unfortunately it is not supported by Sublime Text 3.
As a result, I came to the conclusion that these characters should be replaced with their equivalent names:
def dumb_escape_html(s):
    """Escape *s* for HTML display using named entities.

    ``&`` is replaced first so that the ampersands introduced by the
    later replacements are not escaped a second time. Newlines become
    ``<br>`` and spaces become ``&nbsp;``.

    Returns the escaped string.
    """
    entities = [["&", "&amp;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br>"],
                [" ", "&nbsp;"]]
    for plain, escaped in entities:
        s = s.replace(plain, escaped)
    return s
But again I faced an obstacle: not all names are supported in Sublime. And again an error Parse Error.
I also attach a link to JSON file, which contains answer from my server, content of which should be displayed in pop-up window: Example of data from sever (codeshare.io)
I absolutely do not understand, in what I make a mistake - I hope, that great programmers know how to solve my problem.
Edit. Minimal, Reproducible Example:
import sublime
import sublime_plugin
import string
import sys
import json
def get_func_name(line, column):
    """Return the function name to look up.

    Stub for the minimal example: both arguments are ignored and the
    fixed name "printf" is returned.
    """
    name = "printf"
    return name
def get_const_data(func_name):
    """Load the canned example data from the JSON fixture file.

    *func_name* is accepted for interface compatibility but is not used:
    the file path is fixed.

    Returns the decoded JSON content. Raises whatever ``open`` or
    ``json.load`` raise if the file is missing or malformed.
    """
    # Use a context manager so the handle is closed even if parsing fails.
    with open("PATH_TO_JSON/data_printf.json") as input_file:
        return json.load(input_file)
def dumb_escape_html(s):
    """Escape *s* for HTML display using named entities.

    ``&`` is replaced first so that the ampersands introduced by the
    later replacements are not escaped a second time. Newlines become
    ``<br>`` and spaces become ``&nbsp;``.

    Returns the escaped string.
    """
    entities = [["&", "&amp;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br>"],
                [" ", "&nbsp;"]]
    for plain, escaped in entities:
        s = s.replace(plain, escaped)
    return s
def dumb_unescape_html(s):
    """Reverse the named-entity escaping, turning markup back into plain text.

    ``&amp;`` is decoded last so that text such as ``&amp;lt;`` decodes to
    the literal ``&lt;`` rather than to ``<``.

    Returns the unescaped string.
    """
    entities = [["&lt;", "<"], ["&gt;", ">"], ["<br>", "\n"],
                ["&nbsp;", " "], ["&amp;", "&"]]
    for escaped, plain in entities:
        s = s.replace(escaped, plain)
    return s
class CoderecsysCommand(sublime_plugin.TextCommand):
    """Text command that shows usage examples for a function in a popup."""

    def run(self, edit):
        """Build the HTML list of examples and show it in a popup.

        Any exception raised while building or showing the popup is
        reported in a second, error popup instead of propagating.
        """
        view = self.view
        first_region = view.sel()[0]
        cur_line = view.substr(view.line(first_region))
        for region in view.sel():
            line_begin = view.rowcol(region.begin())[0]
            line_end = view.rowcol(region.end())[0]
        pos = view.rowcol(view.sel()[0].begin())  # (row, column)
        try:
            func_name = get_func_name(cur_line, pos[1] - 1)
            final_data = get_const_data(func_name)
            divider = "<b>____________________________________________________</b>"
            item_template = "<li><p>%s</p>%s <a href='%s'>Copy</a></li><p>%s</p>"
            li_tree = ""
            for index in range(len(final_data)):
                entry = final_data[index]
                source = "source: " + entry["source"]
                escaped = dumb_escape_html(entry["code"])
                li_tree += item_template % (source, escaped, escaped, divider)
            # The html to be shown.
            popup_template = (
                "\n"
                "<body id=copy-multiline>\n"
                "Examples of using <b>%s</b> function.\n"
                "<ul>\n"
                "%s\n"
                "</ul>\n"
                "</body>\n"
            )
            popup_html = popup_template % (func_name, li_tree)
            # `source` is read when a link is clicked, so it holds the value
            # assigned in the last loop iteration.
            self.view.show_popup(
                popup_html,
                max_width=700,
                on_navigate=lambda example: self.copy_example(example, func_name, source),
            )
        except Exception as ex:
            message = "<b style=\"color:#1c87c9\">CodeRec Error:</b> " + str(ex)
            self.view.show_popup(message, max_width=700)

    def copy_example(self, example, func_name, source):
        """Copy the unescaped example, prefixed with its source, to the clipboard."""
        # Copies the code to the clipboard.
        clipboard_text = "// " + source + dumb_unescape_html(example)
        sublime.set_clipboard(clipboard_text)
        self.view.hide_popup()
        sublime.status_message('Example of using ' + func_name + ' copied to clipboard !')

cookie_str = match.group(1).AttributeError: 'NoneType' object has no attribute 'group'

I am working on Stock predicting project.I want to download historical data from yahoo finance and save them in CSV format.
Since I am beginner in Python I am unable to correct the error.
My code is as follows:
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time
crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'
def get_crumble_and_cookie(symbol):
    """Fetch the history page for *symbol* and extract the crumb and cookie.

    The cookie is searched for in the response headers and the crumb in
    the response body. Neither search result is checked, so a pattern
    that does not match raises AttributeError on ``.group``.

    Returns a ``(crumb, cookie)`` tuple.
    """
    page = urllib2.urlopen(crumble_link.format(symbol))
    headers_text = str(page.info())
    cookie_str = re.search(cookie_regex, headers_text).group(1)
    body = page.read()
    crumble_str = re.search(crumble_regex, body).group(1)
    return crumble_str, cookie_str
def download_quote(symbol, date_from, date_to):
    """Download the daily history text for *symbol* between two dates.

    *date_from* and *date_to* are "%Y-%m-%d" strings. A fresh crumb and
    cookie are requested before each of up to five attempts; after a
    URLError the function sleeps two seconds times the number of failed
    attempts so far. Returns the response body, or "" if every attempt
    failed.
    """
    def to_timestamp(day):
        parsed = datetime.datetime.strptime(day, "%Y-%m-%d")
        return calendar.timegm(parsed.timetuple())

    time_stamp_from = to_timestamp(date_from)
    time_stamp_to = to_timestamp(date_to)
    for attempts in range(5):
        crumble_str, cookie_str = get_crumble_and_cookie(symbol)
        link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
        request = urllib2.Request(link, headers={'Cookie': cookie_str})
        try:
            response = urllib2.urlopen(request)
            text = response.read()
            print("{} downloaded".format(symbol))
            return text
        except urllib2.URLError:
            print("{} failed at attempt # {}".format(symbol, attempts))
            time.sleep(2 * (attempts + 1))
    return ""
if __name__ == '__main__':
    # Smoke-test the crumb/cookie retrieval before parsing any arguments.
    print(get_crumble_and_cookie('KO'))

    from_arg, to_arg, symbol_arg = "from", "to", "symbol"
    output_arg = "o"
    opt_list = (from_arg + "=", to_arg + "=", symbol_arg + "=")
    try:
        options, args = getopt.getopt(sys.argv[1:], output_arg + ":", opt_list)
    except getopt.GetoptError as err:
        print(err)

    # Long options arrive as "--name" and the short one as "-o"; each value
    # is only bound if its option was actually supplied.
    for opt, value in options:
        long_name = opt[2:]
        if long_name == from_arg:
            from_val = value
        elif long_name == to_arg:
            to_val = value
        elif long_name == symbol_arg:
            symbol_val = value
        elif opt[1:] == output_arg:
            output_val = value

    print("downloading {}".format(symbol_val))
    text = download_quote(symbol_val, from_val, to_val)
    with open(output_val, 'wb') as f:
        f.write(text)
    print("{} written to {}".format(symbol_val, output_val))
And the Error message that I am getting is :
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
49, in <module>
print get_crumble_and_cookie('KO')
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
19, in get_crumble_and_cookie
cookie_str = match.group(1)
AttributeError: 'NoneType' object has no attribute 'group'
So how can we resolve this problem that has popped up?
Look at these two commands:
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
The first one takes the string response.info() does a regular expression search to match cookie_regex. Then match.group(1) is supposed to take the match from it. The problem however is that if you do a print match in between these commands, you'll see that the re.search() returned nothing. This means match.group() has nothing to "group", which is why it errors out.
If you take a closer look at response.info() (you could just add a print response.info() command in your script to see it), you'll see that there's a line in response code that starts with "set-cookie:", the code after which you're trying to capture. However, you have your cookie_regex string set to look for a line with "Set-Cookie:". Note the capital letters. When I change that string to all lower-case, the error goes away:
cookie_regex = r'set-cookie: (.*?); '
I did run into another error after that, where print "downloading {}".format(symbol_val) stops because symbol_val hasn't been defined. It seems that this variable is only declared and assigned when opt[2:] == symbol_arg:. So you may want to rewrite that part to cover all cases.

unpack requires a string argument of length 24

I am not sure what I am doing wrong here but I am trying to open a file, trace1.flow, read the header information then throw the source IP and destination IP into dictionaries. This is done in Python running on a Fedora VM. I am getting the following error:
(secs, nsecs, booted, exporter, mySourceIP, myDestinationIP) = struct.unpack('IIIIII',myBuf)
struct.error: unpack requires a string argument of length 24
Here is my code:
import struct
import socket
#Dictionaries
uniqSource = {}
uniqDestination = {}
def int2quad(i):
    """Return the dotted-quad IPv4 string for the unsigned 32-bit integer *i*.

    The integer is packed in network byte order before being formatted.
    """
    return socket.inet_ntoa(struct.pack('!I', i))
myFile = open('trace1.flow')
myBuf = myFile.read(8)
(magic, endian, version, headerLen) = struct.unpack('HBBI', myBuf)
print "Magic: ", hex(magic), "Endian: ", endian, "Version: ", version, "Header Length: ", headerLen
myFile.read(headerLen - 8)
try:
while(True):
myBuf = myFile.read(24)
(secs, nsecs, booted, exporter, mySourceIP, myDestinationIP) = struct.unpack('IIIIII',myBuf)
mySourceIP = int2quad(mySourceIP)
myDestinationIP = int2quad(myDestinationIP)
if mySourceIP not in uniqSource:
uniqSource[mySourceIP] = 1
else:
uniqSource[mySourceIP] += 1
if myDestinationIP not in uniqDestination:
uniqDestination[myDestinationIP] = 1
else:
uniqDestination[myDestinationIP] += 1
myFile.read(40)
except EOFError:
print "END OF FILE"
You seem to assume that file.read will raise EOFError on end of file, but this error is only raised by input() and raw_input(). file.read will simply return a string that's shorter than requested (possibly empty).
So you need to check the length after reading:
myBuf = myFile.read(24)
if len(myBuf) < 24:
break
Perhaps you have reached end-of-file. Check the length of myBuf:
len(myBuf)
It's probably less than 24 chars long. Also, you don't need those extra parentheses, and you can specify repeated types with a count prefix ('nI'), like this:
secs, nsecs, booted, exporter, mySourceIP, myDestinationIP = struct.unpack('6I',myBuf)

parsing a string in python for #hashtag

I am wondering how I could write an algorithm that parses a string for the hashtag symbol '#' and returns the full string, but with every word that starts with a '#' symbol turned into a link. I am using Python with Google App Engine (webapp2 and Jinja2), and I am building a blog.
Thanks
A more efficient and complete way to find the "hashwords":
import functools
def hash_position(string):
    """Return the index of the first '#' in *string*, or -1 if there is none."""
    index = string.find('#')
    return index
def delimiter_position(string, delimiters):
    """Return the smallest index at which any delimiter occurs in *string*.

    *delimiters* is an iterable of substrings. Returns -1 when none of
    them occurs, including when *delimiters* is empty.
    """
    found = (string.find(delimiter) for delimiter in delimiters)
    # `default` covers the "nothing found" case directly instead of relying
    # on the TypeError that reduce() raises for an empty sequence.
    return min((index for index in found if index >= 0), default=-1)
def get_hashed_words(string, delimiters):
    """Return every word in *string* that starts with '#'.

    A hash word runs from a '#' up to (not including) the next delimiter,
    or to the end of the string if no delimiter follows.

    *delimiters* is an iterable of substrings that terminate a word.
    Returns the hash words in order of appearance, '#' included.
    """
    results = []
    current_hash_position = hash_position(string)
    while current_hash_position != -1:
        string = string[current_hash_position:]
        current_delimiter_position = delimiter_position(string, delimiters)
        if current_delimiter_position == -1:
            # No delimiter left: the rest of the text is the last word.
            # Stopping here also avoids looping forever on text that ends
            # in '#', where slicing from -1 would keep yielding "#".
            results.append(string)
            break
        results.append(string[:current_delimiter_position])
        # Continue the search after the word just collected.
        string = string[current_delimiter_position:]
        current_hash_position = hash_position(string)
    return results
if __name__ == "__main__":
    # Small demonstration: print the hash words found in a sample sentence.
    delimiters = [' ', '.', ',', ':']
    string = "Please #clarify: What do you #mean with returning somthing as a #link. #herp"
    hashed_words = get_hashed_words(string, delimiters)
    print(hashed_words)
Imperative code with updates of the haystack looks a little bit ugly but hey, that's what we get for (ab-)using mutable variables.
And I still have no idea what you mean by "returning something as a link".
Hope that helps.
not sure where do you get the data for the link, but maybe something like:
[('%s' % word) for word in input.split() if word[0]=='#']
Are you talking about twitter? Maybe this?
def get_hashtag_link(hashtag):
    """Return an HTML anchor for *hashtag*, or None if it is not a hashtag.

    The tag without its leading '#' goes into the URL and the full tag is
    used as the link text:

    >>> get_hashtag_link("#stackoverflow")
    '<a href="https://twitter.com/hashtag/stackoverflow">#stackoverflow</a>'

    NOTE(review): the link target is a reconstruction; the original anchor
    markup was lost from this snippet. Confirm the intended URL.
    NOTE(review): *hashtag* is inserted as-is; escape it first if it comes
    from untrusted input.
    """
    if hashtag.startswith("#"):
        return '<a href="https://twitter.com/hashtag/%s">%s</a>' % (hashtag[1:], hashtag)
    return None
It will return None if hashtag is not a hashtag.

search and replace text inline in file in Python

I am trying to convert a file which contains ip address in the traditional format to a file which contains ip address in the binary format.
The file contents are as follows.
src-ip{ 192.168.64.54 }
dst-ip{ 192.168.43.87 }
The code I have is as follows.
import re
from decimal import *
# Open the input file for reading and the output file for writing.
# NOTE(review): the name `filter` shadows the builtin, and `output` is not
# written to or closed anywhere in the lines shown.
filter = open("filter.txt", "r")
output = open("format.txt", "w")
for line in filter:
# Regex text for a single octet; four copies are joined with an escaped dot
# to form the pattern for a whole dotted address.
bytePattern = "([01]?\d\d?|2[0-4]\d|25[0-5])"
regObj = re.compile("\.".join([bytePattern]*4))
for match in regObj.finditer(line):
# The four captured octets are unpacked here but not used below.
m1,m2,m3,m4 = match.groups()
# The text searched for is built from the constant '123.123.123.123', not
# from the match, and the replacement is the regex text in bytePattern.
line = line.replace((' '.join([bin(256 + int(x))[3:] for x in '123.123.123.123'.split('.')])),bytePattern)
print line
The line.replace() call does not seem to be working. The first argument to line.replace() is evaluated correctly (i.e. it converts the IP address into binary format),
but the replacement itself doesn't seem to take effect. Any help or clues as to why this happens would be appreciated.
def _binary_octets(token):
    """Return *token* with each octet in binary if it is a dotted quad.

    Tokens that are not four dot-separated runs of digits are returned
    unchanged.
    """
    parts = token.split('.')
    if len(parts) != 4 or not all(part.isdigit() for part in parts):
        return token
    # Adding 256 forces a leading 1 bit, so slicing off "0b1" leaves the
    # octet zero-padded to eight binary digits.
    return '.'.join(bin(int(part) + 256)[3:] for part in parts)


# Copy filter.txt to format.txt with every IP address written in binary.
# Each whitespace-separated token is checked individually, so it does not
# matter how many addresses a line holds; blank lines are dropped.
with open('filter.txt') as filter_, open("format.txt", "w") as converted:
    for line in filter_:
        if line != '\n':
            tokens = [_binary_octets(token) for token in line.split()]
            converted.write(" ".join(tokens) + '\n')
Why not take advantage of re.sub() instead, to both make your replacements easier and simplify your regex?
import re
from decimal import *
filter = open("filter.txt", "r")
output = open("format.txt", "w")
pattern = re.compile(r'[\d.]+') # Matches any sequence of digits and .'s
def convert_match_to_binary(match):
    """Replacement callback for ``pattern.sub``: write an IP address in binary.

    *match* is a regex match whose full text is a run of digits and dots.
    If that text is a dotted quad with every octet in 0-255, each octet is
    rendered as eight binary digits, joined with dots. Any other match
    (a lone number, stray dots) is returned unchanged.
    """
    text = match.group(0)
    octets = text.split('.')
    # The pattern is deliberately loose, so validate before converting.
    if len(octets) != 4 or not all(o.isdigit() and int(o) <= 255 for o in octets):
        return text
    new_form = '.'.join(format(int(o), '08b') for o in octets)
    return new_form
# Run every line of the input through the pattern; each match is replaced
# by whatever convert_match_to_binary returns for it.
for line in filter:
line = pattern.sub(convert_match_to_binary, line)
print line
Your code is very odd:
line = line.replace(
(' '.join([bin(256 + int(x))[3:] for x in '123.123.123.123'.split('.')])),
bytePattern
)
The first argument is a constant that evaluates to '01111011 01111011 01111011 01111011', and bytePattern is the regex "([01]?\d\d?|2[0-4]\d|25[0-5])", so it's effectively this:
line = line.replace('01111011 01111011 01111011 01111011', "([01]?\d\d?|2[0-4]\d|25[0-5])")
This won't do anything if your file doesn't have 01111011 01111011 01111011 01111011 in it.
The .replace() method only replaces literal strings, not regexes.
If it is any help, here is my old code from DaniWeb for converting an IP number between a dotted-number string and an integer, with some error checking added.
def ipnumber(ip):
    """Convert a dotted-quad IP string to its integer value.

    Trailing whitespace in *ip* is ignored.

    Raises ValueError if the string does not contain exactly three dots,
    if a part is not an integer, or if a part is outside 0-255.
    """
    if ip.count('.') != 3:
        raise ValueError('IP string with wrong number of dots')
    octets = [int(part) for part in ip.rstrip().split('.')]
    if any(octet < 0 or octet > 255 for octet in octets):
        raise ValueError('IP part of wrong value: %s' % octets)
    number = 0
    # Fold the octets in from most to least significant.
    for octet in octets:
        number = (number << 8) + octet
    return number
def ipstring(ip):
    """Convert an integer IP number to its dotted-quad string.

    Raises ValueError if *ip* is outside the range 0 to 2**32 - 1.
    """
    # divmod by 256 always yields a remainder in 0-255, so the only
    # meaningful validity check is on the whole number, done up front.
    if ip < 0 or ip > 0xFFFFFFFF:
        raise ValueError("IP number %i is not valid." % ip)
    octets = []
    for _ in range(4):
        ip, n = divmod(ip, 256)
        octets.append(str(n))
    # Octets were collected least-significant first.
    return '.'.join(reversed(octets))
# Demo: pull each address out of the "-ip{ ... }" groups and print its
# integer value.
# NOTE(review): the first address ends in 544, which is above 255, so
# ipnumber's range check presumably raises ValueError on it - confirm that
# this is the intended demonstration of the error check.
inp = "src-ip{ 192.168.64.544 } dst-ip{ 192.168.43.87 }"
found=' '
while found:
# Drop everything up to and including the next '-ip{ ' marker.
_,found,ip = inp.partition('-ip{ ')
# Split the address from the rest; the rest becomes the next input.
ip,found,inp = ip.partition(' }')
if ip:
print ipnumber(ip)

Categories

Resources