Python HTMLParser Not Reading Whole File - python

from HTMLParser import HTMLParser
class HTMLParserDos(HTMLParser):
full_text = ""
def handle_data(self, data):
self.full_text += data
return self.full_text
h = HTMLParserDos()
file = open('emails.txt', 'r')
h.feed(file.read())
file.close()
print h.container
This code is getting an error:
Traceback (most recent call last): File "/Users/laurenstrom/Google
Drive/PYTHON/RANDO_CALRISSIAN/html_parse", line 15, in
h.feed(file.read()) File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py",
line 108, in feed
self.goahead(0) File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py",
line 148, in goahead
k = self.parse_starttag(i) File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py",
line 229, in parse_starttag
endpos = self.check_for_whole_start_tag(i) File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py",
line 304, in check_for_whole_start_tag
self.error("malformed start tag") File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py",
line 115, in error
raise HTMLParseError(message, self.getpos()) HTMLParseError: malformed start tag, at line 7, column 18
I'm not sure what I'm missing about .feed() but I can't seem to find anything about why it won't just read the whole file.

You are asking the HTML parser to parse a file, most of which isn't HTML. It is tripping over line 7 of your file, which is:
Return-Path: <Tom@sjnetworkconsulting.com>
I would imagine it sees the < and assumes it is the start of an HTML tag, which of course it is not.

Related

Scraping list from webpage

I am trying to extract items in 'Symbol' column for this webpage:
https://chartink.com/screener/2-short-trend
My code is like this:
from requests_html import HTMLSession
def stockList(url):
session = HTMLSession()
r = session.get(url)
r.html.render(sleep=1)
stock = [{item.text} for item in r.html.xpath('//*[#class="//*[#id="DataTables_Table_0"]/tbody/tr[1]/td[3]/a"]')]
return stock
listStock = stockList('https://chartink.com/screener/2-short-trend')
print(listStock)
Error:
Traceback (most recent call last):
File "C:\Users\kashk\PycharmProjects\test\venv\Scripts\CIscreen2pcShort.py", line 12, in <module>
listStock = stockList('https://chartink.com/screener/2-short-trend')
File "C:\Users\kashk\PycharmProjects\test\venv\Scripts\CIscreen2pcShort.py", line 10, in stockList
stock = [{item.text} for item in r.html.xpath('//*[#class="//*[#id="DataTables_Table_0"]/tbody/tr[1]/td[3]/a"]')]
File "C:\Users\kashk\PycharmProjects\test\venv\lib\site-packages\requests_html.py", line 255, in xpath
selected = self.lxml.xpath(selector)
File "src\lxml\etree.pyx", line 1597, in lxml.etree._Element.xpath
File "src\lxml\xpath.pxi", line 305, in lxml.etree.XPathElementEvaluator.__call__
File "src\lxml\xpath.pxi", line 225, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Invalid predicate
Can you please suggest where I am going wrong?
Also, is there a way to send this output to a dataframe?
File "src\lxml\etree.pyx", line 1597, in lxml.etree._Element.xpath
File "src\lxml\xpath.pxi", line 305, in lxml.etree.XPathElementEvaluator.call
File "src\lxml\xpath.pxi", line 225, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Invalid expression

Problem with adding Excel files at Pandas | wrapper return func

Hi everybody, I have a problem loading an Excel file with pandas.
I took the file from an archive; if I load it directly, it gives me an error. If I copy and paste the Excel file, there is no problem.
The code is very simple:
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\pandas.xlsx")
and this is the error:
Traceback (most recent call last):
File "C:/Users/obett/PycharmProjects/pythonProject6/main.py", line 24, in <module>
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\Aida_Export_67.xlsx")
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 344, in read_excel
data = io.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 1170, in parse
return self._reader.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 492, in parse
data = self.get_sheet_data(sheet, convert_float)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in get_sheet_data
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in <listcomp>
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 514, in _convert_cell
elif cell.is_date:
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 101, in is_date
return Cell.is_date.__get__(self)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\cell.py", line 256, in is_date
self.data_type == 'n' and is_date_format(self.number_format)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 66, in number_format
_id = self.style_array.numFmtId
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 56, in style_array
return self.parent.parent._cell_styles[self._style_id]
IndexError: list index out of range
Thank you very much

Unable to decode yml file ... utf8' codec can't decode byte #xa0: invalid start byte

I'm trying to read a YAML file and convert it into a dictionary. I'm seeing an issue while loading the file into a dict variable.
I searched for similar issues. One of the replies on Stack Overflow suggested replacing each '\\xa0' character with ' ', so I tried line = line.replace('\\xa0',' '). This program doesn't work on Python 2.7. When I use Python 3, it works fine.
import yaml
import sys
yaml_dir = "/root/tools/test_case/"
#file_name = "TC_CFD_SR.yml"
file_name = "TC_QB.yml"
tc_file_name = yaml_dir + file_name
def write(file,content):
file = open(file,'a')
file.write(content)
file.close()
def verifyYmlFile(yml_file):
data = {}
with open(yml_file, 'r') as fin:
for line in fin:
line = line.replace('\\xa0',' ')
write('anand-yaml.yml',line)
with open('anand-yaml.yml','r') as fin:
data = yaml.load(fin)
return data
if __name__ == '__main__':
data = {}
print "verifying yaml"
data= verifyYmlFile(tc_file_name)
Error:
[root@anand-harness test_case]# python verify_yaml.py
verifying yaml
Traceback (most recent call last):
File "verify_yaml.py", line 29, in <module>
data= verifyYmlFile(tc_file_name)
File "verify_yaml.py", line 23, in verifyYmlFile
data = yaml.load(fin)
File "/usr/lib64/python2.6/site-packages/yaml/__init__.py", line 71, in load
return loader.get_single_data()
File "/usr/lib64/python2.6/site-packages/yaml/constructor.py", line 37, in get_single_data
node = self.get_single_node()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 36, in get_single_node
document = self.compose_document()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 55, in compose_document
node = self.compose_node(None, None)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 82, in compose_node
node = self.compose_sequence_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 111, in compose_sequence_node
node.value.append(self.compose_node(node, index))
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 84, in compose_node
node = self.compose_mapping_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 133, in compose_mapping_node
item_value = self.compose_node(node, item_key)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 64, in compose_node
if self.check_event(AliasEvent):
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 98, in check_event
self.current_event = self.state()
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 449, in parse_block_mapping_value
if not self.check_token(KeyToken, ValueToken, BlockEndToken):
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 116, in check_token
self.fetch_more_tokens()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 244, in fetch_more_tokens
return self.fetch_single()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 653, in fetch_single
self.fetch_flow_scalar(style='\'')
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 667, in fetch_flow_scalar
self.tokens.append(self.scan_flow_scalar(style))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1156, in scan_flow_scalar
chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1196, in scan_flow_scalar_non_spaces
while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 91, in peek
self.update(index+1)
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 165, in update
exc.encoding, exc.reason)
yaml.reader.ReaderError: 'utf8' codec can't decode byte #xa0: invalid start byte
in "anand-yaml.yml", position 3246
What am I missing?
The character sequence "\\xa0" is not the problem that you see in the message; the problem is the sequence "\xa0" (note that the backslash is not escaped).
Your replacement line should be:
line = line.replace('\xa0',' ')
to circumvent the problem.
If you know what the file's encoding is, you can do the correct conversion yourself, but that should not be necessary, and neither that nor the patching above is a structural solution. It would be best if the YAML file were generated correctly (YAML files default to UTF-8, so it should contain valid UTF-8). It could be UTF-16 without the appropriate BOM (which the yaml library interprets, IIRC).
s1 = 'abc\\xa0xyz'
print(repr(s1))
u1 = s1.decode('utf-8') # this works fine
s = 'abc\xa0xyz'
print(repr(s))
u = s.decode('utf-8') # this throws an error

ConfigParser.MissingSectionHeaderError when reading config file Python

I am trying to read some values from a config file params.txt using ConfigParser in Python, but I keep getting a MissingSectionHeaderError.
I have a file params.txt:
[all]
zigzag = 0.08
fractal = 0.03
rng_length = 1000
stp = 100
and the following code:
parser = cp.SafeConfigParser()
g = open(params, 'r')
g.readline()
parser.readfp(g)
print parser.getfloat('all', zigzag)
where I am getting this error:
Traceback (most recent call last):
File "deadrabbit_console_0-1.py", line 166, in <module>
DRconsole().cmdloop()
File "/usr/lib/python2.7/cmd.py", line 142, in cmdloop
stop = self.onecmd(line)
File "/usr/lib/python2.7/cmd.py", line 221, in onecmd
return func(arg)
File "deadrabbit_console_0-1.py", line 127, in do_load_data
get_data(series, params)
File "deadrabbit_console_0-1.py", line 115, in get_data
parser.readfp(g)
File "/usr/lib/python2.7/ConfigParser.py", line 324, in readfp
self._read(fp, filename)
File "/usr/lib/python2.7/ConfigParser.py", line 512, in _read
raise MissingSectionHeaderError(fpname, lineno, line)
ConfigParser.MissingSectionHeaderError: File contains no section headers.
file: /home/baconwichsand/Documents/Dead Rabbit/params.txt, line: 1
'zigzag = 0.08\n'
What's wrong?
For some reason you are doing:
g.readline()
before passing the file to readfp. This consumes the line containing [all], so when SafeConfigParser reads the rest of the file it never sees the section header, and you receive that error. To fix it, simply don't call readline():
In [4]: parser = cp.SafeConfigParser()
...: with open('data.ini', 'r') as g:
...: parser.readfp(g)
...: print(parser.getfloat('all', 'zigzag'))
0.08

How to fix encoding in Python Mechanize?

here is the sample code:
from mechanize import Browser
br = Browser()
page = br.open('http://hunters.tclans.ru/news.php?readmore=2')
br.form = br.forms().next()
print br.form
The problem is that the server returns an incorrect encoding name (windows-cp1251). How can I manually set the encoding of the current page in mechanize?
Error:
Traceback (most recent call last):
File "/tmp/stackoverflow.py", line 5, in <module>
br.form = br.forms().next()
File "/usr/local/lib/python2.6/dist-packages/mechanize/_mechanize.py", line 426, in forms
return self._factory.forms()
File "/usr/local/lib/python2.6/dist-packages/mechanize/_html.py", line 559, in forms
self._forms_factory.forms())
File "/usr/local/lib/python2.6/dist-packages/mechanize/_html.py", line 225, in forms
_urlunparse=_rfc3986.urlunsplit,
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 967, in ParseResponseEx
_urlunparse=_urlunparse,
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 1104, in _ParseFileEx
fp.feed(data)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 870, in feed
sgmllib.SGMLParser.feed(self, data)
File "/usr/lib/python2.6/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.6/sgmllib.py", line 193, in goahead
self.handle_entityref(name)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 751, in handle_entityref
'&%s;' % name, self._entitydefs, self._encoding))
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 238, in unescape
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
File "/usr/lib/python2.6/re.py", line 151, in sub
return _compile(pattern, 0).sub(repl, string, count)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 230, in replace_entities
repl = repl.encode(encoding)
LookupError: unknown encoding: windows-cp1251
I don't know about Mechanize, but you could hack codecs to accept wrong encoding names that have both ‘windows’ and ‘cp’:
>>> def fixcp(name):
... if name.lower().startswith('windows-cp'):
... try:
... return codecs.lookup(name[:8]+name[10:])
... except LookupError:
... pass
... return None
...
>>> codecs.register(fixcp)
>>> '\xcd\xe0\xef\xee\xec\xe8\xed\xe0\xe5\xec'.decode('windows-cp1251')
u'Напоминаем'
Fixed by setting
br._factory.encoding = enc
br._factory._forms_factory.encoding = enc
br._factory._links_factory._encoding = enc
(note the underscores) after br.open()

Categories

Resources