In [1]:
# Silence transformers warnings (e.g. about the uninitialized classification head
# created when AutoModelForSequenceClassification loads a base checkpoint below)
%env TRANSFORMERS_VERBOSITY=error
env: TRANSFORMERS_VERBOSITY=error
In [2]:
%%time
# Imports for tokenizing the Carolina corpus with HuggingFace tokenizers.
# NOTE(review): several names appear unused in the visible cells
# (sys, tqdm, tqdm_notebook, load_dataset, Dataset, dataclass, AutoModel) —
# confirm they are not used elsewhere before pruning.
import gc
import sys
from pathlib import Path
from tqdm.notebook import tqdm, tqdm_notebook
from datasets import load_dataset, Dataset
from dataclasses import dataclass
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from datasets.utils.logging import enable_progress_bar

# Show `datasets` progress bars during the long map/save operations below
enable_progress_bar()
CPU times: user 478 ms, sys: 1.05 s, total: 1.53 s
Wall time: 1.14 s
In [3]:
%%time
# NOTE(review): mid-notebook import — conventionally belongs in the top import cell
from datasets import load_from_disk
# Memory-map the deduplicated Carolina 1.2.1-b paragraph corpus (~28M rows)
# rather than loading it into RAM
carolina_ds = load_from_disk('datasets/carolina-1.2.1-b/paragraphs-deduplicated', keep_in_memory=False)
print(carolina_ds)
Dataset({
    features: ['text', 'src'],
    num_rows: 28062069
})
CPU times: user 269 ms, sys: 2.64 s, total: 2.91 s
Wall time: 32.7 s
/home/mir/miniconda3/envs/tcc/lib/python3.11/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by mode='default'.
  table = cls._concat_blocks(blocks, axis=0)
In [4]:
def tokenize(model_name):
    """Tokenize the global ``carolina_ds`` with ``model_name``'s tokenizer,
    truncating every example to the model's maximum sequence length.

    Parameters
    ----------
    model_name : str
        HuggingFace Hub identifier of the model whose tokenizer to use.

    Returns
    -------
    datasets.Dataset
        ``carolina_ds`` extended with the tokenizer's output columns
        (input_ids, attention_mask, ...).
    """
    print(f"Loading model {model_name}")
    # Read max_position_embeddings from the config alone. The original loaded
    # the full AutoModelForSequenceClassification weights just to inspect the
    # config, then dropped them with `model = None` (no gc) — AutoConfig gets
    # the same value without downloading/instantiating any weights.
    from transformers import AutoConfig  # local import: not in the top import cell
    max_length = AutoConfig.from_pretrained(model_name).max_position_embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_length)

    def preprocess(batch):
        # BUG FIX: truncation is a *call-time* argument. Passing
        # `truncation=True, max_length=...` to `from_pretrained` (as the
        # original did) has no effect on tokenization, so sequences longer
        # than the model accepts were being produced. (The original also
        # carried a `flag` variable initialized to True whose `if not flag:`
        # branch could never fire — dead code, removed.)
        out = tokenizer(batch["text"], truncation=True, max_length=max_length)
        for k in out.keys():
            batch[k] = out[k]
        return batch

    return carolina_ds.map(preprocess, batched=True)
In [5]:
%%time
# Tokenize the full corpus with BERTimbau (Portuguese BERT base, cased)
bertimbau_ds = tokenize('neuralmind/bert-base-portuguese-cased')
Loading model neuralmind/bert-base-portuguese-cased
CPU times: user 3.18 s, sys: 1.32 s, total: 4.5 s
Wall time: 6.1 s
In [6]:
%%time
# Persist the BERTimbau-tokenized dataset; heavy I/O (~3 min on the recorded run)
bertimbau_ds.save_to_disk('datasets/carolina-1.2.1-b-tokenized/bertimbau')
CPU times: user 10.3 s, sys: 10.7 s, total: 21 s
Wall time: 3min 5s
In [7]:
bert_ds = tokenize('bert-base-multilingual-cased')
Loading model bert-base-multilingual-cased
In [ ]:
%%time
# Persist the mBERT-tokenized dataset (cell shows `In [ ]` — not yet executed in this saved run)
bert_ds.save_to_disk('datasets/carolina-1.2.1-b-tokenized/bert')
In [ ]: