In [1]:
%env TRANSFORMERS_VERBOSITY=error
env: TRANSFORMERS_VERBOSITY=error
In [2]:
%%time
import gc
import sys
from pathlib import Path
from tqdm.notebook import tqdm, tqdm_notebook
from datasets import load_dataset, Dataset
from dataclasses import dataclass
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from datasets.utils.logging import enable_progress_bar
enable_progress_bar()
CPU times: user 478 ms, sys: 1.05 s, total: 1.53 s
Wall time: 1.14 s
In [3]:
%%time
from datasets import load_from_disk
carolina_ds = load_from_disk('datasets/carolina-1.2.1-b/paragraphs-deduplicated', keep_in_memory=False)
print(carolina_ds)
Dataset({
    features: ['text', 'src'],
    num_rows: 28062069
})
CPU times: user 269 ms, sys: 2.64 s, total: 2.91 s
Wall time: 32.7 s
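For orientation, a quick spot-check (not part of the original run) confirms the schema printed above: indexing the Dataset returns a plain dict with the 'text' and 'src' fields.

    sample = carolina_ds[0]
    print(sample.keys())        # dict_keys(['text', 'src'])
    print(sample['text'][:80])  # first characters of the first paragraph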
In [4]:
def tokenize(model_name):
    print(f"Loading model {model_name}")
    # Load the model only to read its maximum sequence length, then release it.
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    max_length = model.config.max_position_embeddings
    model = None
    gc.collect()
    tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True, max_length=max_length)
    flag = False
    def preprocess(batch):
        # Print a blank line once, on the first batch (cosmetic separator
        # before the map progress output).
        nonlocal flag
        if not flag:
            flag = True
            print()
        # Tokenize the batch and append the tokenizer's output columns
        # (input_ids, attention_mask, ...) to the existing ones.
        out = tokenizer(batch["text"])
        for k in out.keys():
            batch[k] = out[k]
        return batch
    return carolina_ds.map(preprocess, batched=True)
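A note on the helper above: it instantiates the full classification model just to read max_position_embeddings. A lighter alternative, sketched below on the assumption that only the config value is needed, reads it via AutoConfig without loading the weights.

    from transformers import AutoConfig
    config = AutoConfig.from_pretrained('neuralmind/bert-base-portuguese-cased')
    max_length = config.max_position_embeddings  # 512 for BERT-style checkpoints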
In [5]:
%%time
bertimbau_ds = tokenize('neuralmind/bert-base-portuguese-cased')
Loading model neuralmind/bert-base-portuguese-cased
CPU times: user 3.18 s, sys: 1.32 s, total: 4.5 s
Wall time: 6.1 s
In [6]:
%%time
bertimbau_ds.save_to_disk('datasets/carolina-1.2.1-b-tokenized/bertimbau')
CPU times: user 10.3 s, sys: 10.7 s, total: 21 s
Wall time: 3min 5s
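As a sanity check (not executed here), the saved dataset can be reloaded with load_from_disk; the tokenizer's output columns should appear alongside the original ones (column names assumed from the usual BERT tokenizer output).

    check = load_from_disk('datasets/carolina-1.2.1-b-tokenized/bertimbau')
    print(check.column_names)  # expected: ['text', 'src', 'input_ids', 'token_type_ids', 'attention_mask']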
In [7]:
bert_ds = tokenize('bert-base-multilingual-cased')
Loading model bert-base-multilingual-cased
In [ ]:
%%time
bert_ds.save_to_disk('datasets/carolina-1.2.1-b-tokenized/bert')