I've got a question while following the simple gensim tutorial on the gensim website:
>>> from gensim.test.utils import common_texts, get_tmpfile
>>> from gensim.models import Word2Vec
>>>
>>> path = get_tmpfile("word2vec.model")
>>>
>>> model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
>>> model.save("word2vec.model")
>>> model = Word2Vec.load("word2vec.model")
>>> model.train([["hello", "world"]], total_examples=1, epochs=1)
>>> from gensim.models import KeyedVectors
>>>
>>> path = get_tmpfile("wordvectors.kv")
>>>
And when I tried the following:
>>> model.wv.save(path)
>>> wv = KeyedVectors.load("model.wv", mmap='r')
I got the following error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-81-eee6865b677b> in <module>
1 path = get_tmpfile('wordvectors.kv')
2 model.wv.save(path)
----> 3 KeyedVectors.load("model.wv",mmap='r')
/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
210 @classmethod
211 def load(cls, fname_or_handle, **kwargs):
--> 212 return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs)
213
214 def similarity(self, entity1, entity2):
/anaconda3/lib/python3.7/site-packages/gensim/utils.py in load(cls, fname, mmap)
420 compress, subname = SaveLoad._adapt_by_suffix(fname)
421
--> 422 obj = unpickle(fname)
423 obj._load_specials(fname, mmap, compress, subname)
424 logger.info("loaded %s", fname)
/anaconda3/lib/python3.7/site-packages/gensim/utils.py in unpickle(fname)
1356
1357 """
-> 1358 with smart_open(fname, 'rb') as f:
1359 # Because of loading from S3 load can't be used (missing readline in smart_open)
1360 if sys.version_info > (3, 0):
/anaconda3/lib/python3.7/site-packages/smart_open/smart_open_lib.py in smart_open(uri, mode, **kw)
179 raise TypeError('mode should be a string')
180
--> 181 fobj = _shortcut_open(uri, mode, **kw)
182 if fobj is not None:
183 return fobj
/anaconda3/lib/python3.7/site-packages/smart_open/smart_open_lib.py in _shortcut_open(uri, mode, **kw)
299 #
300 if six.PY3:
--> 301 return open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs)
302 elif not open_kwargs:
303 return open(parsed_uri.uri_path, mode, buffering=buffering)
FileNotFoundError: [Errno 2] No such file or directory: 'model.wv'
Does anyone know the reason for this message? How can I check whether I actually have a 'model.wv' file?
Thank you in advance!
Change it from:
wv = KeyedVectors.load("model.wv", mmap='r')
to:
wv = KeyedVectors.load(path, mmap='r')
You should be loading the file 'wordvectors.kv' that you just saved to path (the temporary file returned by get_tmpfile). There is no 'model.wv' file on disk; model.wv is only the in-memory attribute holding the word vectors, and save() wrote its contents to path.
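For reference, a minimal corrected save/load sequence, assuming the same model and path as in the question (the os.path.exists check is only an extra illustration to confirm the file really exists on disk):
import os
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
path = get_tmpfile("wordvectors.kv")      # an actual temporary file path, e.g. /tmp/wordvectors.kv
model.wv.save(path)                       # writes the word vectors to that path
print(os.path.exists(path))               # True - the saved file lives at 'path', not at 'model.wv'
wv = KeyedVectors.load(path, mmap='r')    # load from the same path you saved to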
I am trying to save the GPT2 tokenizer as follows:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = GPT2Tokenizer.eos_token
dataset_file = "x.csv"
df = pd.read_csv(dataset_file, sep=",")
input_ids = tokenizer.batch_encode_plus(list(df["x"]), max_length=1024, padding='max_length', truncation=True)["input_ids"]
# saving the tokenizer
tokenizer.save_pretrained("tokenfile")
I am getting the following error:
TypeError: Object of type property is not JSON serializable
More details:
TypeError Traceback (most recent call last)
Cell In[x], line 3
1 # Save the fine-tuned model
----> 3 tokenizer.save_pretrained("tokenfile")
File /3tb/share/anaconda3/envs/ak_env/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2130, in PreTrainedTokenizerBase.save_pretrained(self, save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
2128 write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
2129 with open(special_tokens_map_file, "w", encoding="utf-8") as f:
-> 2130 out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
2131 f.write(out_str)
2132 logger.info(f"Special tokens file saved in {special_tokens_map_file}")
File /3tb/share/anaconda3/envs/ak_env/lib/python3.10/json/__init__.py:238, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
232 if cls is None:
233 cls = JSONEncoder
234 return cls(
235 skipkeys=skipkeys, ensure_ascii=ensure_ascii,
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
--> 238 **kw).encode(obj)
File /3tb/share/anaconda3/envs/ak_env/lib/python3.10/json/encoder.py:201, in JSONEncoder.encode(self, o)
199 chunks = self.iterencode(o, _one_shot=True)
...
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type property is not JSON serializable
How can I solve this issue?
The problem is on this line:
tokenizer.pad_token = GPT2Tokenizer.eos_token
Here eos_token is accessed on the GPT2Tokenizer class instead of on the tokenizer instance, so the assignment stores the class-level property object rather than the actual token string, and json.dumps cannot serialize a property when save_pretrained writes the special tokens map. That is why this error occurs.
A simple solution is to modify this line to:
tokenizer.pad_token = tokenizer.eos_token
For reference, your final code will look like this:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
dataset_file = "x.csv"
df = pd.read_csv(dataset_file, sep=",")
input_ids = tokenizer.batch_encode_plus(list(df["x"]), max_length=1024, padding='max_length', truncation=True)["input_ids"]
# saving the tokenizer
tokenizer.save_pretrained("tokenfile")
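To see why the original line fails, here is a quick illustrative check (a sketch added for reference, not part of the original answer; '<|endoftext|>' is GPT-2's default end-of-sequence token):
print(type(GPT2Tokenizer.eos_token))   # <class 'property'> - the class-level descriptor, not JSON serializable
print(type(tokenizer.eos_token))       # <class 'str'> - the actual token string
print(tokenizer.eos_token)             # <|endoftext|>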
Hello, I'm having trouble with the word2vec weighting part and I don't know what's wrong, because I'm still a beginner. Here is the code:
w2vModel = word2vec.KeyedVectors.load_word2vec_format("cc.id.300.vec", binary=False, limit=50000)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets_split)
X = tokenizer.texts_to_sequences(tweets_split)
maxlentweet = 100
X = pad_sequences(X, maxlen=maxlentweet)
print(X.shape)
# w2vModel.wv['happy']
2022-03-10 21:37:27,682 : INFO : loading projection weights from cc.id.300.vec
and the error description is:
FileNotFoundError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_4960\3085887112.py in <module>
----> 1 w2vModel = word2vec.KeyedVectors.load_word2vec_format("cc.id.300.vec", binary=False, limit=50000)
2
3 tokenizer = Tokenizer()
4 tokenizer.fit_on_texts(tweets_split)
5 X = tokenizer.texts_to_sequences(tweets_split)
appdata\local\programs\python\python37\lib\site-packages\gensim\models\keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header)
1629 return _load_word2vec_format(
1630 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1631 limit=limit, datatype=datatype, no_header=no_header,
appdata\local\programs\python\python37\lib\site-packages\gensim\models\keyedvectors.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header, binary_chunk_size)
1954 logger.info("loading projection weights from %s", fname)
-> 1955 with utils.open(fname, 'rb') as fin:
1956 if no_header:
1957 # deduce both vocab_size & vector_size from 1st pass over file
appdata\local\programs\python\python37\lib\site-packages\smart_open\smart_open_lib.py in open(uri, mode, buffering, encoding, errors, newline, closefd, opener, ignore_ext, compression, transport_params)
193 encoding=encoding,
194 errors=errors,
--> 195 newline=newline,
196 )
197 if fobj is not None:
appdata\local\programs\python\python37\lib\site-packages\smart_open\smart_open_lib.py in _shortcut_open(uri, mode, compression, buffering, encoding, errors, newline)
359 open_kwargs['errors'] = errors
--> 361 return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)
FileNotFoundError: [Errno 2] No such file or directory: 'cc.id.300.vec'
Please help me.
I am following a tutorial here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
I am at the part "Word2vec and Logistic Regression". I have downloaded the "GoogleNews-vectors-negative300.bin.gz" file and I am trying to apply it to my own text data. However, when I get to the following code:
%%time
import gensim
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("/data/users/USERS/File_path/classifier/GoogleNews_Embedding/GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)
I run into the following error:
/data/users/msmith/env/lib64/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
<timed exec> in <module>
~/env/lib64/python3.6/site-packages/gensim/models/keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
1492 return _load_word2vec_format(
1493 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1494 limit=limit, datatype=datatype)
1495
1496 def get_keras_embedding(self, train_embeddings=False):
~/env/lib64/python3.6/site-packages/gensim/models/utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
383 with utils.ignore_deprecation_warning():
384 # TODO use frombuffer or something similar
--> 385 weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
386 add_word(word, weights)
387 else:
/usr/lib64/python3.6/gzip.py in read(self, size)
274 import errno
275 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 276 return self._buffer.read(size)
277
278 def read1(self, size=-1):
/usr/lib64/python3.6/_compression.py in readinto(self, b)
66 def readinto(self, b):
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
70 return len(data)
/usr/lib64/python3.6/gzip.py in read(self, size)
480 break
481 if buf == b"":
--> 482 raise EOFError("Compressed file ended before the "
483 "end-of-stream marker was reached")
484
EOFError: Compressed file ended before the end-of-stream marker was reached
Any idea what's gone wrong / how to overcome this issue?
Thanks in advance!
I would like to define my own namespace "http://example.org/" in rdflib, but apparently that can't be done. I can't figure out the proper way to do it...
In [1]: import rdflib
INFO:rdflib:RDFLib Version: 4.2.2
In [2]: g = rdflib.Graph()
In [3]: from rdflib import Namespace
In [4]: n1 = Namespace("http://example.org/")
In [5]: u1 = n1['1']
In [6]: u1
Out[6]: rdflib.term.URIRef(u'http://example.org/1')
In [7]: g.bind('ex', n1)
In [8]: g.add((u1, u1, u1)
...: )
In [9]: g.serialize()
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-9-25a09aa9a7b5> in <module>()
----> 1 g.serialize()
/usr/local/lib/python2.7/site-packages/rdflib/graph.pyc in serialize(self, destination, format, base, encoding, **args)
937 if destination is None:
938 stream = BytesIO()
--> 939 serializer.serialize(stream, base=base, encoding=encoding, **args)
940 return stream.getvalue()
941 if hasattr(destination, "write"):
/usr/local/lib/python2.7/site-packages/rdflib/plugins/serializers/rdfxml.pyc in serialize(self, stream, base, encoding, **args)
64 # assert(
65 # namespaces["http://www.w3.org/1999/02/22-rdf-syntax-ns#"]=='rdf')
---> 66 bindings = list(self.__bindings())
67 bindings.sort()
68
/usr/local/lib/python2.7/site-packages/rdflib/plugins/serializers/rdfxml.pyc in __bindings(self)
31
32 for predicate in set(store.predicates()):
---> 33 prefix, namespace, name = nm.compute_qname(predicate)
34 bindings[prefix] = URIRef(namespace)
35
/usr/local/lib/python2.7/site-packages/rdflib/namespace.pyc in compute_qname(self, uri, generate)
328
329 if not uri in self.__cache:
--> 330 namespace, name = split_uri(uri)
331 namespace = URIRef(namespace)
332 prefix = self.store.prefix(namespace)
/usr/local/lib/python2.7/site-packages/rdflib/namespace.pyc in split_uri(uri)
500 return (ns, ln)
501 break
--> 502 raise Exception("Can't split '%s'" % uri)
Exception: Can't split 'http://example.org/1'
I have installed Anaconda3-4.2.0 for Windows (64-bit) and nltk-3.2.1. While running the following code in a Jupyter Notebook:
para = "Hello World. It's good to see you. Thanks for buying this book."
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
tokenizer.tokenize(para)
I am getting the following error:
OSError Traceback (most recent call last)
<ipython-input-1-a87e01558cc4> in <module>()
1 para = "Hello World. It's good to see you. Thanks for buying this book."
2 import nltk.data
----> 3 tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
4 tokenizer.tokenize(para)
C:\Anaconda3\lib\site-packages\nltk\data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
799
800 # Load the resource.
--> 801 opened_resource = _open(resource_url)
802
803 if format == 'raw':
C:\Anaconda3\lib\site-packages\nltk\data.py in _open(resource_url)
917
918 if protocol is None or protocol.lower() == 'nltk':
--> 919 return find(path_, path + ['']).open()
920 elif protocol.lower() == 'file':
921 # urllib might not use mode='rb', so handle this one ourselves:
C:\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
607 return GzipFileSystemPathPointer(p)
608 else:
--> 609 return FileSystemPathPointer(p)
610 else:
611 p = os.path.join(path_, url2pathname(zipfile))
C:\Anaconda3\lib\site-packages\nltk\compat.py in _decorator(*args, **kwargs)
559 def _decorator(*args, **kwargs):
560 args = (args[0], add_py3_data(args[1])) + args[2:]
--> 561 return init_func(*args, **kwargs)
562 return wraps(init_func)(_decorator)
563
C:\Anaconda3\lib\site-packages\nltk\data.py in __init__(self, _path)
298 _path = os.path.abspath(_path)
299 if not os.path.exists(_path):
--> 300 raise IOError('No such file or directory: %r' % _path)
301 self._path = _path
302
OSError: No such file or directory: 'C:\\nltk_data\\tokenizers\\punkt\\PY3\\PY3\\english.pickle'
I have downloaded the punkt word tokenizer in nltk. Why am I seeing this error? Please give me an answer.
It seems the tokenizers/punkt/PY3/english.pickle file does not exist on your machine. You need to check that.
NLTK can download the pickle file with its download function:
import nltk
nltk.download()
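After the download, here is a hedged sketch of tokenizing the sentence with the higher-level nltk.sent_tokenize helper, which loads the English Punkt model for you (nltk.download('punkt') fetches just the Punkt models):
import nltk
nltk.download('punkt')   # fetches the Punkt sentence tokenizer models into nltk_data
para = "Hello World. It's good to see you. Thanks for buying this book."
print(nltk.sent_tokenize(para))
# ['Hello World.', "It's good to see you.", 'Thanks for buying this book.']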