Training a byte-level BPE tokenizer in Rust with the Hugging Face `tokenizers` crate

 



// use tokenizers::decoders::DecoderWrapper;

use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};

use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};

use tokenizers::pre_tokenizers::byte_level::ByteLevel;

// use tokenizers::pre_tokenizers::PreTokenizerWrapper;

// use tokenizers::processors::PostProcessorWrapper;

use tokenizers::{AddedToken, Result, TokenizerBuilder};



/// Trains a byte-level BPE tokenizer and saves it to `tokenizer.json`.
///
/// Training files may be passed as command-line arguments; when none are
/// given, the default `./src/vocab.txt` is used (the original behavior).
/// After training, a short Vietnamese phrase is encoded as a smoke test
/// and its tokens are printed.
fn main() -> Result<()> {

    // Target vocabulary size for the trained BPE model.
    let vocab_size: usize = 74_000;

    let mut trainer = BpeTrainerBuilder::new()
        .show_progress(true)
        .vocab_size(vocab_size)
        // Keep every observed merge candidate, regardless of frequency.
        .min_frequency(0)
        .special_tokens(vec![
            // If these tokens are already part of the vocabulary, the
            // Tokenizer is simply made aware of them. If they don't exist,
            // the Tokenizer creates them and assigns each a new id.
            AddedToken::from(String::from("<s>"), true),
            AddedToken::from(String::from("<pad>"), true),
            AddedToken::from(String::from("</s>"), true),
            AddedToken::from(String::from("<unk>"), true),
            AddedToken::from(String::from("<mask>"), true),
        ])
        .build();

    // Normalize with whitespace stripping + NFC, then use byte-level
    // pre-tokenization/post-processing/decoding so arbitrary UTF-8 input
    // round-trips losslessly.
    let mut tokenizer = TokenizerBuilder::new()
        .with_model(BPE::default())
        .with_normalizer(Some(Sequence::new(vec![
            Strip::new(true, true).into(),
            NFC.into(),
        ])))
        .with_pre_tokenizer(Some(ByteLevel::default()))
        .with_post_processor(Some(ByteLevel::default()))
        .with_decoder(Some(ByteLevel::default()))
        .build()?;

    // Training corpus: any paths given on the command line, falling back to
    // the historical hard-coded default when none are supplied.
    let mut files: Vec<String> = std::env::args().skip(1).collect();
    if files.is_empty() {
        files.push("./src/vocab.txt".to_string());
    }

    // Compact JSON output; set to true for human-readable formatting.
    let pretty = false;

    tokenizer
        .train_from_files(&mut trainer, files)?
        .save("tokenizer.json", pretty)?;

    // Smoke test: encode a phrase (without adding special tokens) and show
    // the resulting tokens.
    let encoding = tokenizer.encode("Bánh mì pa tê", false)?;
    println!("{:?}", encoding.get_tokens());

    Ok(())

}


Post a Comment

Comment

Previous Post Next Post
WANG !!!!!
https://s.shopee.vn/609U3II1Xf