I'm parsing a site and the following fails with an error that I'm not able to resolve with Google:
from lxml.html.soupparser import fromstring
# etree.LXML_VERSION = (4, 1, 1, 0)
# www.hbs-info.de/produkte/schweisselemente.html
fromstring(open(r"HBS Schweißelemente.htm").read())
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-caba4799682e> in <module>()
1 from lxml.html.soupparser import fromstring
----> 2 fromstring(open(r"HBS Schweißelemente.htm").read())
\lib\site-packages\lxml\html\soupparser.py in fromstring(data, beautifulsoup, makeelement, **bsargs)
31 used.
32 """
---> 33 return _parse(data, beautifulsoup, makeelement, **bsargs)
34
35
\lib\site-packages\lxml\html\soupparser.py in _parse(source, beautifulsoup, makeelement, **bsargs)
77 bsargs['features'] = 'html.parser' # use Python html parser
78 tree = beautifulsoup(source, **bsargs)
---> 79 root = _convert_tree(tree, makeelement)
80 # from ET: wrap the document in a html root element, if necessary
81 if len(root) == 1 and root[0].tag == "html":
\lib\site-packages\lxml\html\soupparser.py in _convert_tree(beautiful_soup_tree, makeelement)
153 prev = res_root
154 for e in reversed(pre_root):
--> 155 converted = convert_node(e)
156 if converted is not None:
157 prev.addprevious(converted)
\lib\site-packages\lxml\html\soupparser.py in convert_node(bs_node, parent)
214 if handler is None:
215 return None
--> 216 return handler(bs_node, parent)
217
218 def map_attrs(bs_attrs):
\lib\site-packages\lxml\html\soupparser.py in convert_pi(bs_node, parent)
271 # interpreted it as being SGML style (<?as df>). Fix.
272 bs_node = bs_node[:-1]
--> 273 res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
274 if parent is not None:
275 parent.append(res)
src/lxml/etree.pyx in lxml.etree.ProcessingInstruction (src\lxml\etree.c:79300)()
ValueError: Invalid PI name 'b'xml''
What could be the cause?
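A sketch of one possible workaround, assuming the failing node is the page's <?xml ... ?> declaration (that is a guess based on the traceback, which dies while converting a processing instruction whose content starts with 'xml', not a confirmed cause): strip a leading XML declaration before handing the markup to soupparser.
import re
from lxml.html.soupparser import fromstring

with open(r"HBS Schweißelemente.htm", encoding="utf-8") as f:
    html = f.read()

# Drop a leading <?xml ... ?> declaration, if present, before parsing.
html = re.sub(r'^\s*<\?xml[^>]*\?>', '', html)
root = fromstring(html)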
I was trying to use the langchain library to create a question-answering system, but when I try to search the document using the chromadb library, it gives this error:
TypeError: create_collection() got an unexpected keyword argument 'embedding_fn'
Here's the code I am working on:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
loader = TextLoader('./info.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(texts, embeddings)
The last line generates the error.
This is the complete error message:
TypeError Traceback (most recent call last)
Input In [36], in <cell line: 1>()
----> 1 docsearch = Chroma.from_documents(texts, embeddings)
File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:212, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, **kwargs)
210 texts = [doc.page_content for doc in documents]
211 metadatas = [doc.metadata for doc in documents]
--> 212 return cls.from_texts(
213 texts=texts,
214 embedding=embedding,
215 metadatas=metadatas,
216 ids=ids,
217 collection_name=collection_name,
218 persist_directory=persist_directory,
219 )
File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:178, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, **kwargs)
151 @classmethod
152 def from_texts(
153 cls,
(...)
160 **kwargs: Any,
161 ) -> Chroma:
162 """Create a Chroma vectorstore from a raw documents.
163
164 If a persist_directory is specified, the collection will be persisted there.
(...)
176 Chroma: Chroma vectorstore.
177 """
--> 178 chroma_collection = cls(
179 collection_name=collection_name,
180 embedding_function=embedding,
181 persist_directory=persist_directory,
182 )
183 chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
184 return chroma_collection
File ~\anaconda3\lib\site-packages\langchain\vectorstores\chroma.py:65, in Chroma.__init__(self, collection_name, embedding_function, persist_directory)
60 logger.warning(
61 f"Collection {collection_name} already exists,"
62 " Do you have the right embedding function?"
63 )
64 else:
---> 65 self._collection = self._client.create_collection(
66 name=collection_name,
67 embedding_fn=self._embedding_function.embed_documents
68 if self._embedding_function is not None
69 else None,
70 )
TypeError: create_collection() got an unexpected keyword argument 'embedding_fn'
The create_collection method of chromadb.Client was changed 2 days ago and the embedding_fn parameter was renamed to embedding_function:
https://github.com/chroma-core/chroma/commit/6ce2388e219d47048e854be72be54617df647224
The source code for the langchain.vectorstores.chroma.Chroma class as of version 0.0.87 seems to have been updated already (3 hours before you asked the question) to match the chromadb library:
https://github.com/hwchase17/langchain/commit/34cba2da3264ccc9100f7efd16807c8d2a51734c
So you should be able to fix the problem by installing the newest version of LangChain.
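For example, assuming the packages were installed with pip, upgrading both LangChain and chromadb should pull in matching APIs:
$ pip install -U langchain chromadb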
I am trying to access a shared folder using the following code:
credentials = Credentials(username=user_name, password="secret")
config = Configuration(server='outlook.office365.com', credentials=credentials, auth_type=NTLM)
account = Account(primary_smtp_address='shared_mail@domain.com', credentials=credentials, autodiscover=False, config=config, access_type=DELEGATE)
The above three lines of code work perfectly, but we are unable to get the root; the following code, account.root.tree() or account.root, throws this error:
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\cached_property.py in __get__(self, obj, cls)
68 # check if the value was computed before the lock was acquired
---> 69 return obj_dict[name]
70
KeyError: 'root'
During handling of the above exception, another exception occurred:
ErrorNonExistentMailbox Traceback (most recent call last)
<ipython-input-46-a90a4f76ca21> in <module>
2 logging.basicConfig(level=logging.DEBUG)
3
----> 4 account.root.tree()
~\anaconda3\lib\site-packages\cached_property.py in __get__(self, obj, cls)
71 except KeyError:
72 # if not, do the calculation and release the lock
---> 73 return obj_dict.setdefault(name, self.func(obj))
74
75
~\anaconda3\lib\site-packages\exchangelib\account.py in root(self)
268 @threaded_cached_property
269 def root(self):
--> 270 return Root.get_distinguished(account=self)
271
272 @threaded_cached_property
~\anaconda3\lib\site-packages\exchangelib\folders\roots.py in get_distinguished(cls, account)
107 return cls.resolve(
108 account=account,
--> 109 folder=cls(account=account, name=cls.DISTINGUISHED_FOLDER_ID, is_distinguished=True)
110 )
111 except ErrorFolderNotFound:
~\anaconda3\lib\site-packages\exchangelib\folders\base.py in resolve(cls, account, folder)
485 def resolve(cls, account, folder):
486 # Resolve a single folder
--> 487 folders = list(FolderCollection(account=account, folders=[folder]).resolve())
488 if not folders:
489 raise ErrorFolderNotFound('Could not find folder %r' % folder)
~\anaconda3\lib\site-packages\exchangelib\folders\collections.py in resolve(self)
254 additional_fields = self.get_folder_fields(target_cls=self._get_target_cls(), is_complex=None)
255 for f in self.__class__(account=self.account, folders=resolveable_folders).get_folders(
--> 256 additional_fields=additional_fields
257 ):
258 yield f
~\anaconda3\lib\site-packages\exchangelib\folders\collections.py in get_folders(self, additional_fields)
317 folders=self.folders,
318 additional_fields=additional_fields,
--> 319 shape=ID_ONLY,
320 ):
321 yield f
~\anaconda3\lib\site-packages\exchangelib\services\get_folder.py in call(self, folders, additional_fields, shape)
32 **dict(
33 additional_fields=additional_fields,
---> 34 shape=shape,
35 )
36 )):
~\anaconda3\lib\site-packages\exchangelib\services\common.py in _pool_requests(self, payload_func, items, **kwargs)
538 for i, chunk in enumerate(chunkify(items, self.chunk_size), start=1):
539 log.debug('Processing %s chunk %s containing %s items', self.__class__.__name__, i, len(chunk))
--> 540 for elem in self._get_elements(payload=payload_func(chunk, **kwargs)):
541 yield elem
542
~\anaconda3\lib\site-packages\exchangelib\services\common.py in _get_elements_in_response(self, response)
401 def _get_elements_in_response(self, response):
402 for msg in response:
--> 403 container_or_exc = self._get_element_container(message=msg, name=self.element_container_name)
404 if isinstance(container_or_exc, (bool, Exception)):
405 yield container_or_exc
~\anaconda3\lib\site-packages\exchangelib\services\common.py in _get_element_container(self, message, response_message, name)
360 # rspclass == 'Error', or 'Success' and not 'NoError'
361 try:
--> 362 raise self._get_exception(code=response_code, text=msg_text, msg_xml=msg_xml)
363 except self.ERRORS_TO_CATCH_IN_RESPONSE as e:
364 return e
ErrorNonExistentMailbox: Mailbox does not exist.
The same code seems to be working here: https://medium.com/@theamazingexposure/accessing-shared-mailbox-using-exchangelib-python-f020e71a96ab
I also checked this thread https://github.com/ecederstrand/exchangelib/issues/391 and tried almost all of the solutions there, but I am facing the same error.
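One sanity check worth trying (a guess, not a confirmed fix): ErrorNonExistentMailbox usually means Exchange did not recognise the primary_smtp_address, so connecting to your own mailbox first with the same credentials can show whether the configuration itself is fine and only the shared address is wrong. The own_smtp_address value below is a placeholder for your own address:
from exchangelib import Credentials, Configuration, Account, DELEGATE, NTLM

credentials = Credentials(username=user_name, password="secret")
config = Configuration(server='outlook.office365.com', credentials=credentials, auth_type=NTLM)

# If this works but the shared address does not, the shared mailbox address
# is likely wrong or not visible to this account.
own_account = Account(primary_smtp_address=own_smtp_address, credentials=credentials,
                      autodiscover=False, config=config, access_type=DELEGATE)
print(own_account.root.tree())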
I am trying to create a series of charts and save them iteratively.
The creation works well, but while saving I get the following error:
---------------------------------------------------------------------------
NoMatchingVersions Traceback (most recent call last)
<ipython-input-103-e75c3f4b4fa5> in <module>
29 chart=(background + chart).configure_view(stroke='white')
30 filename = f"{scenario}.svg"
---> 31 save(chart, filename, method='selenium', webdriver=driver)
~\Anaconda3\lib\site-packages\altair_saver\_core.py in save(chart, fp, fmt, mode, method, **kwargs)
75 saver = Saver(spec, mode=mode, **kwargs)
76
---> 77 saver.save(fp=fp, fmt=fmt)
78
79
~\Anaconda3\lib\site-packages\altair_saver\savers\_saver.py in save(self, fp, fmt)
86 raise ValueError(f"Got fmt={fmt}; expected one of {self.valid_formats}")
87
---> 88 content = self.mimebundle(fmt).popitem()[1]
89 if isinstance(content, dict):
90 with maybe_open(fp, "w") as f:
~\Anaconda3\lib\site-packages\altair_saver\savers\_saver.py in mimebundle(self, fmts)
66 f"invalid fmt={fmt!r}; must be one of {self.valid_formats}."
67 )
---> 68 bundle.update(self._mimebundle(fmt))
69 return bundle
70
~\Anaconda3\lib\site-packages\altair_saver\savers\_selenium.py in _mimebundle(self, fmt)
249
250 def _mimebundle(self, fmt: str) -> Mimebundle:
--> 251 out = self._extract(fmt)
252 mimetype = fmt_to_mimetype(
253 fmt,
~\Anaconda3\lib\site-packages\altair_saver\savers\_selenium.py in _extract(self, fmt)
209 js_resources = {
210 "vega.js": get_bundled_script("vega", self._vega_version),
--> 211 "vega-lite.js": get_bundled_script("vega-lite", self._vegalite_version),
212 "vega-embed.js": get_bundled_script(
213 "vega-embed", self._vegaembed_version
~\Anaconda3\lib\site-packages\altair_viewer\_scripts.py in get_bundled_script(package, version)
36 f"package {package!r} not recognized. Available: {list(listing)}"
37 )
---> 38 version_str = find_version(version, listing[package])
39 content = pkgutil.get_data("altair_viewer", f"scripts/{package}-{version_str}.js")
40 if content is None:
~\Anaconda3\lib\site-packages\altair_viewer\_utils.py in find_version(version, candidates, strict_micro)
190 if not matches:
191 raise NoMatchingVersions(
--> 192 f"No matches for version={version!r} among {candidates}"
193 )
194 return str(matches[-1])
NoMatchingVersions: No matches for version='4.8.1' among ['4.0.2']
I am using selenium and altair_saver:
from altair_saver import save
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r'pathtochromedriver/chromedriver_win32/chromedriver.exe')
for i, scenario in enumerate(scenario_columns):
    chart = makechart(scenario, i)
    filename = f"{scenario}.svg"
    save(chart, filename, method='selenium', webdriver=driver)
Here `scenario` is a string without special characters.
You need to update the altair_viewer package to a newer version:
$ pip install -U altair_viewer
(This error message was improved in https://github.com/altair-viz/altair_viewer/pull/33, so it shouldn't be as mysterious when it comes up in the future.)
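To confirm which version is active after upgrading, a quick standard-library check (Python 3.8+):
import importlib.metadata
print(importlib.metadata.version("altair_viewer"))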
I am a beginner developer who started to study automation using pywinauto.
An OverflowError occurs when using application.connect() to connect to an already open program, but application.start() works fine.
Please help me if someone knows about this part.
The source code and error contents are as follows.
Source code:
import pywinauto
app = pywinauto.application.Application()
app.connect(title_re='Calculator')
Error:
OverflowError Traceback (most recent call last)
in
1 import pywinauto
2 app = pywinauto.application.Application()
----> 3 app.connect(title_re='Calculator')
d:\Anaconda3\lib\site-packages\pywinauto\application.py in connect(self, **kwargs)
972 ).process_id
973 else:
--> 974 self.process = findwindows.find_element(**kwargs).process_id
975 connected = True
976
d:\Anaconda3\lib\site-packages\pywinauto\findwindows.py in find_element(**kwargs)
82 so please see :py:func:find_elements for the full parameters description.
83 """
---> 84 elements = find_elements(**kwargs)
85
86 if not elements:
d:\Anaconda3\lib\site-packages\pywinauto\findwindows.py in find_elements(class_name, class_name_re, parent, process, title, title_re, top_level_only, visible_only, enabled_only, best_match, handle, ctrl_index, found_index, predicate_func, active_only, control_id, control_type, auto_id, framework_id, backend, depth)
279 return title_regex.match(t)
280 return False
--> 281 elements = [elem for elem in elements if _title_match(elem)]
282
283 if visible_only:
d:\Anaconda3\lib\site-packages\pywinauto\findwindows.py in <listcomp>(.0)
279 return title_regex.match(t)
280 return False
--> 281 elements = [elem for elem in elements if _title_match(elem)]
282
283 if visible_only:
d:\Anaconda3\lib\site-packages\pywinauto\findwindows.py in _title_match(w)
275 def _title_match(w):
276 """Match a window title to the regexp"""
--> 277 t = w.rich_text
278 if t is not None:
279 return title_regex.match(t)
d:\Anaconda3\lib\site-packages\pywinauto\win32_element_info.py in rich_text(self)
81 def rich_text(self):
82 """Return the text of the window"""
---> 83 return handleprops.text(self.handle)
84
85 name = rich_text
d:\Anaconda3\lib\site-packages\pywinauto\handleprops.py in text(handle)
86 length += 1
87
---> 88 buffer_ = ctypes.create_unicode_buffer(length)
89
90 ret = win32functions.SendMessage(
d:\Anaconda3\lib\ctypes\__init__.py in create_unicode_buffer(init, size)
286 return buf
287 elif isinstance(init, int):
--> 288 buftype = c_wchar * init
289 buf = buftype()
290 return buf
OverflowError: cannot fit 'int' into an index-sized integer
Try this; if you are having this problem, you have to specify that the backend is 'uia'. It is working fine for me:
import pywinauto
app = pywinauto.Application(backend='uia').start('calc.exe')
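If you need to attach to an already running Calculator instead of starting a new one (which is what the original connect() call does), the same backend argument should apply; a minimal sketch, assuming the Calculator window is open:
from pywinauto.application import Application

app = Application(backend='uia').connect(title_re='Calculator')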
I would like to define my own namespace "http://example.org/" in rdflib, but apparently that can't be done. I can't figure out the proper way to do it...
In [1]: import rdflib
INFO:rdflib:RDFLib Version: 4.2.2
In [2]: g = rdflib.Graph()
In [3]: from rdflib import Namespace
In [4]: n1 = Namespace("http://example.org/")
In [5]: u1 = n1['1']
In [6]: u1
Out[6]: rdflib.term.URIRef(u'http://example.org/1')
In [7]: g.bind('ex', n1)
In [8]: g.add((u1, u1, u1))
In [9]: g.serialize()
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-9-25a09aa9a7b5> in <module>()
----> 1 g.serialize()
/usr/local/lib/python2.7/site-packages/rdflib/graph.pyc in serialize(self, destination, format, base, encoding, **args)
937 if destination is None:
938 stream = BytesIO()
--> 939 serializer.serialize(stream, base=base, encoding=encoding, **args)
940 return stream.getvalue()
941 if hasattr(destination, "write"):
/usr/local/lib/python2.7/site-packages/rdflib/plugins/serializers/rdfxml.pyc in serialize(self, stream, base, encoding, **args)
64 # assert(
65 # namespaces["http://www.w3.org/1999/02/22-rdf-syntax-ns#"]=='rdf')
---> 66 bindings = list(self.__bindings())
67 bindings.sort()
68
/usr/local/lib/python2.7/site-packages/rdflib/plugins/serializers/rdfxml.pyc in __bindings(self)
31
32 for predicate in set(store.predicates()):
---> 33 prefix, namespace, name = nm.compute_qname(predicate)
34 bindings[prefix] = URIRef(namespace)
35
/usr/local/lib/python2.7/site-packages/rdflib/namespace.pyc in compute_qname(self, uri, generate)
328
329 if not uri in self.__cache:
--> 330 namespace, name = split_uri(uri)
331 namespace = URIRef(namespace)
332 prefix = self.store.prefix(namespace)
/usr/local/lib/python2.7/site-packages/rdflib/namespace.pyc in split_uri(uri)
500 return (ns, ln)
501 break
--> 502 raise Exception("Can't split '%s'" % uri)
Exception: Can't split 'http://example.org/1'
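A minimal sketch of two possible workarounds, assuming the problem is that the RDF/XML serializer has to turn each predicate URI into an XML QName and the local name "1" is not a valid XML name (the 'related' and '2' local names below are just illustrations):
import rdflib
from rdflib import Namespace

n1 = Namespace("http://example.org/")

# Option 1: give the predicate a local name that is a valid XML name;
# subjects and objects are written as full URIs, so they are unaffected.
g = rdflib.Graph()
g.bind('ex', n1)
g.add((n1['1'], n1['related'], n1['2']))
print(g.serialize(format='xml'))

# Option 2: serialize to a format that never needs QNames, e.g. N-Triples.
g2 = rdflib.Graph()
g2.add((n1['1'], n1['1'], n1['1']))
print(g2.serialize(format='nt'))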