Running dataset.map raises TypeError: can't pickle Tokenizer objects
See original GitHub issue.
I load the squad dataset and then want to process the data with the following function, using the Hugging Face Transformers LongformerTokenizer.
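For reference, a minimal setup sketch for the code below (the checkpoint name here is only illustrative, and the fast tokenizer variant is used because char_to_token requires it):

from datasets import load_dataset
from transformers import LongformerTokenizerFast

# Any Longformer checkpoint with a fast tokenizer would do; this one is just an example.
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
dataset = load_dataset('squad', split='train')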
def convert_to_features(example):
    # Tokenize the question and context (as a pair of inputs)
    input_pairs = [example['question'], example['context']]
    encodings = tokenizer.encode_plus(input_pairs, pad_to_max_length=True, max_length=512)
    context_encodings = tokenizer.encode_plus(example['context'])

    # Compute start and end tokens for the labels using the fast tokenizers' alignment methods.
    # This gives us the position of the answer span in the context text.
    # get_correct_alignement is a helper defined elsewhere in my script.
    start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx - 1)

    # Now compute the start and end positions of the answer in the whole example.
    # The example is encoded as <s> question</s></s> context</s>, and since we know the
    # position of the answer in the context, we can find the index of the sep token and
    # add it to the context position + 1 (+1 because there are two sep tokens).
    # This gives us the position of the answer span in the whole example.
    sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)
    start_positions = start_positions_context + sep_idx + 1
    end_positions = end_positions_context + sep_idx + 1
    if end_positions > 512:
        start_positions, end_positions = 0, 0

    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions,
                      'attention_mask': encodings['attention_mask']})
    return encodings
Then I run dataset.map(convert_to_features), and it raises:
In [59]: a.map(convert_to_features)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-59-c453b508761d> in <module>
----> 1 a.map(convert_to_features)
/opt/conda/lib/python3.7/site-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
1242 fn_kwargs=fn_kwargs,
1243 new_fingerprint=new_fingerprint,
-> 1244 update_data=update_data,
1245 )
1246 else:
/opt/conda/lib/python3.7/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
151 "output_all_columns": self._output_all_columns,
152 }
--> 153 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
154 if new_format["columns"] is not None:
155 new_format["columns"] = list(set(new_format["columns"]) & set(out.column_names))
/opt/conda/lib/python3.7/site-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
156 kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
157 kwargs[fingerprint_name] = update_fingerprint(
--> 158 self._fingerprint, transform, kwargs_for_fingerprint
159 )
160
/opt/conda/lib/python3.7/site-packages/datasets/fingerprint.py in update_fingerprint(fingerprint, transform, transform_args)
103 for key in sorted(transform_args):
104 hasher.update(key)
--> 105 hasher.update(transform_args[key])
106 return hasher.hexdigest()
107
/opt/conda/lib/python3.7/site-packages/datasets/fingerprint.py in update(self, value)
55 def update(self, value):
56 self.m.update(f"=={type(value)}==".encode("utf8"))
---> 57 self.m.update(self.hash(value).encode("utf-8"))
58
59 def hexdigest(self):
/opt/conda/lib/python3.7/site-packages/datasets/fingerprint.py in hash(cls, value)
51 return cls.dispatch[type(value)](cls, value)
52 else:
---> 53 return cls.hash_default(value)
54
55 def update(self, value):
/opt/conda/lib/python3.7/site-packages/datasets/fingerprint.py in hash_default(cls, value)
44 @classmethod
45 def hash_default(cls, value):
---> 46 return cls.hash_bytes(dumps(value))
47
48 @classmethod
/opt/conda/lib/python3.7/site-packages/datasets/utils/py_utils.py in dumps(obj)
365 file = StringIO()
366 with _no_cache_fields(obj):
--> 367 dump(obj, file)
368 return file.getvalue()
369
/opt/conda/lib/python3.7/site-packages/datasets/utils/py_utils.py in dump(obj, file)
337 def dump(obj, file):
338 """pickle an object to a file"""
--> 339 Pickler(file, recurse=True).dump(obj)
340 return
341
/opt/conda/lib/python3.7/site-packages/dill/_dill.py in dump(self, obj)
444 raise PicklingError(msg)
445 else:
--> 446 StockPickler.dump(self, obj)
447 stack.clear() # clear record of 'recursion-sensitive' pickled objects
448 return
/opt/conda/lib/python3.7/pickle.py in dump(self, obj)
435 if self.proto >= 4:
436 self.framer.start_framing()
--> 437 self.save(obj)
438 self.write(STOP)
439 self.framer.end_framing()
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/opt/conda/lib/python3.7/site-packages/dill/_dill.py in save_function(pickler, obj)
1436 globs, obj.__name__,
1437 obj.__defaults__, obj.__closure__,
-> 1438 obj.__dict__, fkwdefaults), obj=obj)
1439 else:
1440 _super = ('super' in getattr(obj.func_code,'co_names',())) and (_byref is not None) and getattr(pickler, '_recurse', False)
/opt/conda/lib/python3.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
636 else:
637 save(func)
--> 638 save(args)
639 write(REDUCE)
640
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/opt/conda/lib/python3.7/pickle.py in save_tuple(self, obj)
787 write(MARK)
788 for element in obj:
--> 789 save(element)
790
791 if id(obj) in memo:
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/opt/conda/lib/python3.7/site-packages/dill/_dill.py in save_module_dict(pickler, obj)
931 # we only care about session the first pass thru
932 pickler._session = False
--> 933 StockPickler.save_dict(pickler, obj)
934 log.info("# D2")
935 return
/opt/conda/lib/python3.7/pickle.py in save_dict(self, obj)
857
858 self.memoize(obj)
--> 859 self._batch_setitems(obj.items())
860
861 dispatch[dict] = save_dict
/opt/conda/lib/python3.7/pickle.py in _batch_setitems(self, items)
883 for k, v in tmp:
884 save(k)
--> 885 save(v)
886 write(SETITEMS)
887 elif n:
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
547
548 # Save the reduce() output and finally memoize the object
--> 549 self.save_reduce(obj=obj, *rv)
550
551 def persistent_id(self, obj):
/opt/conda/lib/python3.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
660
661 if state is not None:
--> 662 save(state)
663 write(BUILD)
664
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/opt/conda/lib/python3.7/site-packages/dill/_dill.py in save_module_dict(pickler, obj)
931 # we only care about session the first pass thru
932 pickler._session = False
--> 933 StockPickler.save_dict(pickler, obj)
934 log.info("# D2")
935 return
/opt/conda/lib/python3.7/pickle.py in save_dict(self, obj)
857
858 self.memoize(obj)
--> 859 self._batch_setitems(obj.items())
860
861 dispatch[dict] = save_dict
/opt/conda/lib/python3.7/pickle.py in _batch_setitems(self, items)
883 for k, v in tmp:
884 save(k)
--> 885 save(v)
886 write(SETITEMS)
887 elif n:
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
547
548 # Save the reduce() output and finally memoize the object
--> 549 self.save_reduce(obj=obj, *rv)
550
551 def persistent_id(self, obj):
/opt/conda/lib/python3.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
660
661 if state is not None:
--> 662 save(state)
663 write(BUILD)
664
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/opt/conda/lib/python3.7/site-packages/dill/_dill.py in save_module_dict(pickler, obj)
931 # we only care about session the first pass thru
932 pickler._session = False
--> 933 StockPickler.save_dict(pickler, obj)
934 log.info("# D2")
935 return
/opt/conda/lib/python3.7/pickle.py in save_dict(self, obj)
857
858 self.memoize(obj)
--> 859 self._batch_setitems(obj.items())
860
861 dispatch[dict] = save_dict
/opt/conda/lib/python3.7/pickle.py in _batch_setitems(self, items)
883 for k, v in tmp:
884 save(k)
--> 885 save(v)
886 write(SETITEMS)
887 elif n:
/opt/conda/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
522 reduce = getattr(obj, "__reduce_ex__", None)
523 if reduce is not None:
--> 524 rv = reduce(self.proto)
525 else:
526 reduce = getattr(obj, "__reduce__", None)
TypeError: can't pickle Tokenizer objects
We can also update the BertJapaneseTokenizer in transformers, as you just showed @lhoestq, to make it compatible with pickle. That will be faster than asking on fugashi's repo, and good for the other users of transformers as well.

I'm currently working on transformers, so I'll include it in the https://github.com/huggingface/transformers/pull/7141 PR and the next release of transformers.
It looks like this tokenizer is not supported, unfortunately. This is because t.word_tokenizer.mecab is a fugashi.fugashi.GenericTagger, which is compatible with neither pickle nor dill. We need objects passed to map to be picklable for our caching system to work properly. Here it crashes because the caching system is not able to pickle the GenericTagger.

> Maybe you can create an issue on fugashi's repo and ask to make fugashi.fugashi.GenericTagger compatible with pickle?

What you can do in the meantime is use a picklable wrapper of the tokenizer:
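(The original snippet is not reproduced above; the following is a sketch of what such a wrapper could look like. BertJapaneseTokenizer and MecabTokenizer are real transformers classes, but the import path can vary by version, and the exact arguments restored in __setstate__ are an assumption on my part.)

from transformers import BertJapaneseTokenizer, MecabTokenizer  # import path may differ by transformers version

class PicklableTokenizer(BertJapaneseTokenizer):

    def __getstate__(self):
        # Remove the MeCab-backed word tokenizer, which cannot be pickled,
        # but keep the options needed to rebuild it.
        state = self.__dict__.copy()
        state["do_lower_case"] = self.word_tokenizer.do_lower_case
        state["never_split"] = self.word_tokenizer.never_split
        del state["word_tokenizer"]
        return state

    def __setstate__(self, state):
        # Rebuild the MeCab word tokenizer from the saved options on unpickling.
        do_lower_case = state.pop("do_lower_case")
        never_split = state.pop("never_split")
        self.__dict__ = state
        self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split)

t = PicklableTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

A tokenizer built this way can be captured by the function passed to map without the caching fingerprint failing to pickle it.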