use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::{AddedToken, Result, TokenizerBuilder};

fn main() -> Result<()> {
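    // Target vocabulary size for the trained BPE model.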
    let vocab_size: usize = 74000;

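    // Configure the BPE trainer; min_frequency(0) applies no frequency
    // cutoff when deciding which pairs are eligible to be merged.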
    let mut trainer = BpeTrainerBuilder::new()
        .show_progress(true)
        .vocab_size(vocab_size)
        .min_frequency(0)
        .special_tokens(vec![
            // If these tokens are already part of the vocabulary, this just
            // lets the tokenizer know about them. If they don't exist, the
            // tokenizer creates them and assigns them new ids.
AddedToken::from(String::from("<s>"), true),
AddedToken::from(String::from("<pad>"), true),
AddedToken::from(String::from("</s>"), true),
AddedToken::from(String::from("<unk>"), true),
AddedToken::from(String::from("<mask>"), true),
])
.build();
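    // Assemble the pipeline: strip leading/trailing whitespace and apply NFC
    // Unicode normalization, then use byte-level pre-tokenization (with the
    // matching post-processor and decoder) so any byte sequence is representable.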
    let mut tokenizer = TokenizerBuilder::new()
        .with_model(BPE::default())
        .with_normalizer(Some(Sequence::new(vec![
            Strip::new(true, true).into(),
            NFC.into(),
        ])))
        .with_pre_tokenizer(Some(ByteLevel::default()))
        .with_post_processor(Some(ByteLevel::default()))
        .with_decoder(Some(ByteLevel::default()))
        .build()?;

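    // Train on the corpus file, then serialize the full pipeline to JSON
    // (compact output, since pretty = false).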
    let pretty = false;
    tokenizer
        .train_from_files(&mut trainer, vec!["./src/vocab.txt".to_string()])?
        .save("tokenizer.json", pretty)?;

    let encoding = tokenizer.encode("Bánh mì pa tê", false)?;
    println!("{:?}", encoding.get_tokens());
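
    // A minimal round-trip sketch (assumes a tokenizers version whose
    // `decode` takes `&[u32]`; older releases take `Vec<u32>` instead):
    // decoding the ids should reproduce the normalized input string.
    let decoded = tokenizer.decode(encoding.get_ids(), true)?;
    println!("{}", decoded);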

    Ok(())
}