Class SentencepieceModel.TrainerSpec

java.lang.Object
com.google.protobuf.AbstractMessageLite
com.google.protobuf.AbstractMessage
com.google.protobuf.GeneratedMessageV3
com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec>
com.google.genai.proto.SentencepieceModel.TrainerSpec
All Implemented Interfaces:
SentencepieceModel.TrainerSpecOrBuilder, com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder<SentencepieceModel.TrainerSpec>, com.google.protobuf.Message, com.google.protobuf.MessageLite, com.google.protobuf.MessageLiteOrBuilder, com.google.protobuf.MessageOrBuilder, Serializable
Enclosing class:
SentencepieceModel

public static final class SentencepieceModel.TrainerSpec extends com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec> implements SentencepieceModel.TrainerSpecOrBuilder
 TrainerSpec encodes various parameters for SentencePiece training.
 Next id: 55
 
Protobuf type com.google.genai.proto.TrainerSpec
See Also:
  • Field Details

    • INPUT_FIELD_NUMBER

      public static final int INPUT_FIELD_NUMBER
      See Also:
    • INPUT_FORMAT_FIELD_NUMBER

      public static final int INPUT_FORMAT_FIELD_NUMBER
      See Also:
    • MODEL_PREFIX_FIELD_NUMBER

      public static final int MODEL_PREFIX_FIELD_NUMBER
      See Also:
    • MODEL_TYPE_FIELD_NUMBER

      public static final int MODEL_TYPE_FIELD_NUMBER
      See Also:
    • VOCAB_SIZE_FIELD_NUMBER

      public static final int VOCAB_SIZE_FIELD_NUMBER
      See Also:
    • ACCEPT_LANGUAGE_FIELD_NUMBER

      public static final int ACCEPT_LANGUAGE_FIELD_NUMBER
      See Also:
    • SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER

      public static final int SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER
      See Also:
    • ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER

      public static final int ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER
      See Also:
    • DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER

      public static final int DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER
      See Also:
    • DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER

      public static final int DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER
      See Also:
    • CHARACTER_COVERAGE_FIELD_NUMBER

      public static final int CHARACTER_COVERAGE_FIELD_NUMBER
      See Also:
    • INPUT_SENTENCE_SIZE_FIELD_NUMBER

      public static final int INPUT_SENTENCE_SIZE_FIELD_NUMBER
      See Also:
    • SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER

      public static final int SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER
      See Also:
    • MINING_SENTENCE_SIZE_FIELD_NUMBER

      public static final int MINING_SENTENCE_SIZE_FIELD_NUMBER
      See Also:
    • TRAINING_SENTENCE_SIZE_FIELD_NUMBER

      public static final int TRAINING_SENTENCE_SIZE_FIELD_NUMBER
      See Also:
    • SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER

      public static final int SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER
      See Also:
    • SHRINKING_FACTOR_FIELD_NUMBER

      public static final int SHRINKING_FACTOR_FIELD_NUMBER
      See Also:
    • MAX_SENTENCE_LENGTH_FIELD_NUMBER

      public static final int MAX_SENTENCE_LENGTH_FIELD_NUMBER
      See Also:
    • NUM_THREADS_FIELD_NUMBER

      public static final int NUM_THREADS_FIELD_NUMBER
      See Also:
    • NUM_SUB_ITERATIONS_FIELD_NUMBER

      public static final int NUM_SUB_ITERATIONS_FIELD_NUMBER
      See Also:
    • MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER

      public static final int MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER
      See Also:
    • SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER

      public static final int SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER
      See Also:
    • SPLIT_BY_NUMBER_FIELD_NUMBER

      public static final int SPLIT_BY_NUMBER_FIELD_NUMBER
      See Also:
    • SPLIT_BY_WHITESPACE_FIELD_NUMBER

      public static final int SPLIT_BY_WHITESPACE_FIELD_NUMBER
      See Also:
    • TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER

      public static final int TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER
      See Also:
    • ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER

      public static final int ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER
      See Also:
    • SPLIT_DIGITS_FIELD_NUMBER

      public static final int SPLIT_DIGITS_FIELD_NUMBER
      See Also:
    • PRETOKENIZATION_DELIMITER_FIELD_NUMBER

      public static final int PRETOKENIZATION_DELIMITER_FIELD_NUMBER
      See Also:
    • CONTROL_SYMBOLS_FIELD_NUMBER

      public static final int CONTROL_SYMBOLS_FIELD_NUMBER
      See Also:
    • USER_DEFINED_SYMBOLS_FIELD_NUMBER

      public static final int USER_DEFINED_SYMBOLS_FIELD_NUMBER
      See Also:
    • REQUIRED_CHARS_FIELD_NUMBER

      public static final int REQUIRED_CHARS_FIELD_NUMBER
      See Also:
    • BYTE_FALLBACK_FIELD_NUMBER

      public static final int BYTE_FALLBACK_FIELD_NUMBER
      See Also:
    • VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER

      public static final int VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER
      See Also:
    • HARD_VOCAB_LIMIT_FIELD_NUMBER

      public static final int HARD_VOCAB_LIMIT_FIELD_NUMBER
      See Also:
    • USE_ALL_VOCAB_FIELD_NUMBER

      public static final int USE_ALL_VOCAB_FIELD_NUMBER
      See Also:
    • UNK_ID_FIELD_NUMBER

      public static final int UNK_ID_FIELD_NUMBER
      See Also:
    • BOS_ID_FIELD_NUMBER

      public static final int BOS_ID_FIELD_NUMBER
      See Also:
    • EOS_ID_FIELD_NUMBER

      public static final int EOS_ID_FIELD_NUMBER
      See Also:
    • PAD_ID_FIELD_NUMBER

      public static final int PAD_ID_FIELD_NUMBER
      See Also:
    • UNK_PIECE_FIELD_NUMBER

      public static final int UNK_PIECE_FIELD_NUMBER
      See Also:
    • BOS_PIECE_FIELD_NUMBER

      public static final int BOS_PIECE_FIELD_NUMBER
      See Also:
    • EOS_PIECE_FIELD_NUMBER

      public static final int EOS_PIECE_FIELD_NUMBER
      See Also:
    • PAD_PIECE_FIELD_NUMBER

      public static final int PAD_PIECE_FIELD_NUMBER
      See Also:
    • UNK_SURFACE_FIELD_NUMBER

      public static final int UNK_SURFACE_FIELD_NUMBER
      See Also:
    • TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER

      public static final int TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER
      See Also:
    • SEED_SENTENCEPIECES_FILE_FIELD_NUMBER

      public static final int SEED_SENTENCEPIECES_FILE_FIELD_NUMBER
      See Also:
    • PARSER

      @Deprecated public static final com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> PARSER
      Deprecated.
  • Method Details

    • getDescriptor

      public static final com.google.protobuf.Descriptors.Descriptor getDescriptor()
    • getInputList

      public com.google.protobuf.ProtocolStringList getInputList()
      /////////////////////////////////////////////////////////////////
       General parameters
      
       Input corpus files.
        Trainer accepts the following two formats:
        A) Monolingual: plain text, one sentence per line.
        B) Bilingual:   TSV, source sentence <tab> target sentence
        When bilingual data is passed, a shared vocabulary model is built.
        Note that the input file must be a raw corpus, not a preprocessed corpus.
        Trainer only loads the first `input_sentence_size` sentences specified
        with this parameter.
       
      repeated string input = 1;
      Specified by:
      getInputList in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      A list containing the input.
    • getInputCount

      public int getInputCount()
      /////////////////////////////////////////////////////////////////
       General parameters
      
       Input corpus files.
        Trainer accepts the following two formats:
        A) Monolingual: plain text, one sentence per line.
        B) Bilingual:   TSV, source sentence <tab> target sentence
        When bilingual data is passed, a shared vocabulary model is built.
        Note that the input file must be a raw corpus, not a preprocessed corpus.
        Trainer only loads the first `input_sentence_size` sentences specified
        with this parameter.
       
      repeated string input = 1;
      Specified by:
      getInputCount in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The count of input.
    • getInput

      public String getInput(int index)
      /////////////////////////////////////////////////////////////////
       General parameters
      
       Input corpus files.
        Trainer accepts the following two formats:
        A) Monolingual: plain text, one sentence per line.
        B) Bilingual:   TSV, source sentence <tab> target sentence
        When bilingual data is passed, a shared vocabulary model is built.
        Note that the input file must be a raw corpus, not a preprocessed corpus.
        Trainer only loads the first `input_sentence_size` sentences specified
        with this parameter.
       
      repeated string input = 1;
      Specified by:
      getInput in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the element to return.
      Returns:
      The input at the given index.
    • getInputBytes

      public com.google.protobuf.ByteString getInputBytes(int index)
      /////////////////////////////////////////////////////////////////
       General parameters
      
       Input corpus files.
        Trainer accepts the following two formats:
        A) Monolingual: plain text, one sentence per line.
        B) Bilingual:   TSV, source sentence <tab> target sentence
        When bilingual data is passed, a shared vocabulary model is built.
        Note that the input file must be a raw corpus, not a preprocessed corpus.
        Trainer only loads the first `input_sentence_size` sentences specified
        with this parameter.
       
      repeated string input = 1;
      Specified by:
      getInputBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the value to return.
      Returns:
      The bytes of the input at the given index.
    • hasInputFormat

      public boolean hasInputFormat()
       Input corpus format:
       "text": one-sentence-per-line text format (default)
       "tsv":  sentence <tab> freq
       
      optional string input_format = 7;
      Specified by:
      hasInputFormat in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the inputFormat field is set.
    • getInputFormat

      public String getInputFormat()
       Input corpus format:
       "text": one-sentence-per-line text format (default)
       "tsv":  sentence <tab> freq
       
      optional string input_format = 7;
      Specified by:
      getInputFormat in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The inputFormat.
    • getInputFormatBytes

      public com.google.protobuf.ByteString getInputFormatBytes()
       Input corpus format:
       "text": one-sentence-per-line text format (default)
       "tsv":  sentence <tab> freq
       
      optional string input_format = 7;
      Specified by:
      getInputFormatBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for inputFormat.
    • hasModelPrefix

      public boolean hasModelPrefix()
       Output model file prefix.
       <model_prefix>.model and <model_prefix>.vocab are generated.
       
      optional string model_prefix = 2;
      Specified by:
      hasModelPrefix in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the modelPrefix field is set.
    • getModelPrefix

      public String getModelPrefix()
       Output model file prefix.
       <model_prefix>.model and <model_prefix>.vocab are generated.
       
      optional string model_prefix = 2;
      Specified by:
      getModelPrefix in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The modelPrefix.
    • getModelPrefixBytes

      public com.google.protobuf.ByteString getModelPrefixBytes()
       Output model file prefix.
       <model_prefix>.model and <model_prefix>.vocab are generated.
       
      optional string model_prefix = 2;
      Specified by:
      getModelPrefixBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for modelPrefix.
    • hasModelType

      public boolean hasModelType()
      optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
      Specified by:
      hasModelType in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the modelType field is set.
    • getModelType

      public SentencepieceModel.TrainerSpec.ModelType getModelType()
      optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
      Specified by:
      getModelType in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The modelType.
    • hasVocabSize

      public boolean hasVocabSize()
       Vocabulary size. 8k is the default size.
       
      optional int32 vocab_size = 4 [default = 8000];
      Specified by:
      hasVocabSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the vocabSize field is set.
    • getVocabSize

      public int getVocabSize()
       Vocabulary size. 8k is the default size.
       
      optional int32 vocab_size = 4 [default = 8000];
      Specified by:
      getVocabSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The vocabSize.
    • getAcceptLanguageList

      public com.google.protobuf.ProtocolStringList getAcceptLanguageList()
       List of the languages this model can accept.
       Since the model is language-agnostic, this field is used as a reference.
       
      repeated string accept_language = 5;
      Specified by:
      getAcceptLanguageList in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      A list containing the acceptLanguage.
    • getAcceptLanguageCount

      public int getAcceptLanguageCount()
       List of the languages this model can accept.
       Since the model is language-agnostic, this field is used as a reference.
       
      repeated string accept_language = 5;
      Specified by:
      getAcceptLanguageCount in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The count of acceptLanguage.
    • getAcceptLanguage

      public String getAcceptLanguage(int index)
       List of the languages this model can accept.
       Since the model is language-agnostic, this field is used as a reference.
       
      repeated string accept_language = 5;
      Specified by:
      getAcceptLanguage in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the element to return.
      Returns:
      The acceptLanguage at the given index.
    • getAcceptLanguageBytes

      public com.google.protobuf.ByteString getAcceptLanguageBytes(int index)
       List of the languages this model can accept.
       Since the model is language-agnostic, this field is used as a reference.
       
      repeated string accept_language = 5;
      Specified by:
      getAcceptLanguageBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the value to return.
      Returns:
      The bytes of the acceptLanguage at the given index.
    • hasSelfTestSampleSize

      public boolean hasSelfTestSampleSize()
       Size of self-test samples, which are encoded in the model file.
       
      optional int32 self_test_sample_size = 6 [default = 0];
      Specified by:
      hasSelfTestSampleSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the selfTestSampleSize field is set.
    • getSelfTestSampleSize

      public int getSelfTestSampleSize()
       Size of self-test samples, which are encoded in the model file.
       
      optional int32 self_test_sample_size = 6 [default = 0];
      Specified by:
      getSelfTestSampleSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The selfTestSampleSize.
    • hasEnableDifferentialPrivacy

      public boolean hasEnableDifferentialPrivacy()
       Whether to use DP version of sentencepiece. Use it with TSV input format
       (requires precomputed word tab counts to work).
       
      optional bool enable_differential_privacy = 50 [default = false];
      Specified by:
      hasEnableDifferentialPrivacy in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the enableDifferentialPrivacy field is set.
    • getEnableDifferentialPrivacy

      public boolean getEnableDifferentialPrivacy()
       Whether to use DP version of sentencepiece. Use it with TSV input format
       (requires precomputed word tab counts to work).
       
      optional bool enable_differential_privacy = 50 [default = false];
      Specified by:
      getEnableDifferentialPrivacy in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The enableDifferentialPrivacy.
    • hasDifferentialPrivacyNoiseLevel

      public boolean hasDifferentialPrivacyNoiseLevel()
       Set these parameters if you need DP version of sentencepiece.
       std of noise to add.
       
      optional float differential_privacy_noise_level = 51 [default = 0];
      Specified by:
      hasDifferentialPrivacyNoiseLevel in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the differentialPrivacyNoiseLevel field is set.
    • getDifferentialPrivacyNoiseLevel

      public float getDifferentialPrivacyNoiseLevel()
       Set these parameters if you need DP version of sentencepiece.
       std of noise to add.
       
      optional float differential_privacy_noise_level = 51 [default = 0];
      Specified by:
      getDifferentialPrivacyNoiseLevel in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The differentialPrivacyNoiseLevel.
    • hasDifferentialPrivacyClippingThreshold

      public boolean hasDifferentialPrivacyClippingThreshold()
       Clipping threshold to apply after adding noise. All the words with
       frequency less than this value are dropped.
       
      optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
      Specified by:
      hasDifferentialPrivacyClippingThreshold in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the differentialPrivacyClippingThreshold field is set.
    • getDifferentialPrivacyClippingThreshold

      public long getDifferentialPrivacyClippingThreshold()
       Clipping threshold to apply after adding noise. All the words with
       frequency less than this value are dropped.
       
      optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
      Specified by:
      getDifferentialPrivacyClippingThreshold in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The differentialPrivacyClippingThreshold.
    • hasCharacterCoverage

      public boolean hasCharacterCoverage()
      /////////////////////////////////////////////////////////////////
       Training parameters.
      
       Uses characters which cover the corpus with the ratio of `character_coverage`.
       This parameter determines the basic alphabet of the sentence pieces.
       The remaining (1.0 - `character_coverage`) fraction of characters is treated as UNK.
       See also required_chars field.
       
      optional float character_coverage = 10 [default = 0.9995];
      Specified by:
      hasCharacterCoverage in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the characterCoverage field is set.
    • getCharacterCoverage

      public float getCharacterCoverage()
      /////////////////////////////////////////////////////////////////
       Training parameters.
      
       Uses characters which cover the corpus with the ratio of `character_coverage`.
       This parameter determines the basic alphabet of the sentence pieces.
       The remaining (1.0 - `character_coverage`) fraction of characters is treated as UNK.
       See also required_chars field.
       
      optional float character_coverage = 10 [default = 0.9995];
      Specified by:
      getCharacterCoverage in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The characterCoverage.
    • hasInputSentenceSize

      public boolean hasInputSentenceSize()
       Maximum number of sentences the trainer loads from the `input` parameter.
       Trainer simply loads the `input` files in sequence.
       It is better to shuffle the input corpus randomly.
       
      optional uint64 input_sentence_size = 11 [default = 0];
      Specified by:
      hasInputSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the inputSentenceSize field is set.
    • getInputSentenceSize

      public long getInputSentenceSize()
       Maximum number of sentences the trainer loads from the `input` parameter.
       Trainer simply loads the `input` files in sequence.
       It is better to shuffle the input corpus randomly.
       
      optional uint64 input_sentence_size = 11 [default = 0];
      Specified by:
      getInputSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The inputSentenceSize.
    • hasShuffleInputSentence

      public boolean hasShuffleInputSentence()
      optional bool shuffle_input_sentence = 19 [default = true];
      Specified by:
      hasShuffleInputSentence in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the shuffleInputSentence field is set.
    • getShuffleInputSentence

      public boolean getShuffleInputSentence()
      optional bool shuffle_input_sentence = 19 [default = true];
      Specified by:
      getShuffleInputSentence in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The shuffleInputSentence.
    • hasMiningSentenceSize

      @Deprecated public boolean hasMiningSentenceSize()
      Deprecated.
      com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated. See sentencepiece_model.proto;l=96
       Maximum size of sentences to make seed sentence pieces.
       Extended suffix array is constructed to extract frequent
       sub-strings from the corpus. This uses 20N working space,
       where N is the size of corpus.
       
      optional int32 mining_sentence_size = 12 [deprecated = true];
      Specified by:
      hasMiningSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the miningSentenceSize field is set.
    • getMiningSentenceSize

      @Deprecated public int getMiningSentenceSize()
      Deprecated.
      com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated. See sentencepiece_model.proto;l=96
       Maximum size of sentences to make seed sentence pieces.
       Extended suffix array is constructed to extract frequent
       sub-strings from the corpus. This uses 20N working space,
       where N is the size of corpus.
       
      optional int32 mining_sentence_size = 12 [deprecated = true];
      Specified by:
      getMiningSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The miningSentenceSize.
    • hasTrainingSentenceSize

      @Deprecated public boolean hasTrainingSentenceSize()
      Deprecated.
      com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated. See sentencepiece_model.proto;l=99
       Maximum size of sentences to train sentence pieces.
       
      optional int32 training_sentence_size = 13 [deprecated = true];
      Specified by:
      hasTrainingSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the trainingSentenceSize field is set.
    • getTrainingSentenceSize

      @Deprecated public int getTrainingSentenceSize()
      Deprecated.
      com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated. See sentencepiece_model.proto;l=99
       Maximum size of sentences to train sentence pieces.
       
      optional int32 training_sentence_size = 13 [deprecated = true];
      Specified by:
      getTrainingSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The trainingSentenceSize.
    • hasSeedSentencepieceSize

      public boolean hasSeedSentencepieceSize()
       The size of seed sentencepieces.
       `seed_sentencepiece_size` must be larger than `vocab_size`.
       
      optional int32 seed_sentencepiece_size = 14 [default = 1000000];
      Specified by:
      hasSeedSentencepieceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the seedSentencepieceSize field is set.
    • getSeedSentencepieceSize

      public int getSeedSentencepieceSize()
       The size of seed sentencepieces.
       `seed_sentencepiece_size` must be larger than `vocab_size`.
       
      optional int32 seed_sentencepiece_size = 14 [default = 1000000];
      Specified by:
      getSeedSentencepieceSize in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The seedSentencepieceSize.
    • hasShrinkingFactor

      public boolean hasShrinkingFactor()
       In every EM sub-iteration, keeps the top
       `shrinking_factor` * `current sentencepieces size` with respect to
       the loss of the sentence piece. This value should be smaller than 1.0.
       
      optional float shrinking_factor = 15 [default = 0.75];
      Specified by:
      hasShrinkingFactor in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the shrinkingFactor field is set.
    • getShrinkingFactor

      public float getShrinkingFactor()
       In every EM sub-iteration, keeps the top
       `shrinking_factor` * `current sentencepieces size` with respect to
       the loss of the sentence piece. This value should be smaller than 1.0.
       
      optional float shrinking_factor = 15 [default = 0.75];
      Specified by:
      getShrinkingFactor in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The shrinkingFactor.
    • hasMaxSentenceLength

      public boolean hasMaxSentenceLength()
       The maximum sentence length in bytes. Sentences with a length
       larger than `max_sentence_length` are simply ignored.
       Longer input tends to bring the following risks:
        * Overflow during EM training (unigram language model only)
        * Performance drop because of O(n log n) cost in BPE.
       
      optional int32 max_sentence_length = 18 [default = 4192];
      Specified by:
      hasMaxSentenceLength in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the maxSentenceLength field is set.
    • getMaxSentenceLength

      public int getMaxSentenceLength()
       The maximum sentence length in bytes. Sentences with a length
       larger than `max_sentence_length` are simply ignored.
       Longer input tends to bring the following risks:
        * Overflow during EM training (unigram language model only)
        * Performance drop because of O(n log n) cost in BPE.
       
      optional int32 max_sentence_length = 18 [default = 4192];
      Specified by:
      getMaxSentenceLength in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The maxSentenceLength.
    • hasNumThreads

      public boolean hasNumThreads()
       Number of threads in the training.
       
      optional int32 num_threads = 16 [default = 16];
      Specified by:
      hasNumThreads in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the numThreads field is set.
    • getNumThreads

      public int getNumThreads()
       Number of threads in the training.
       
      optional int32 num_threads = 16 [default = 16];
      Specified by:
      getNumThreads in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The numThreads.
    • hasNumSubIterations

      public boolean hasNumSubIterations()
       Number of EM sub iterations.
       
      optional int32 num_sub_iterations = 17 [default = 2];
      Specified by:
      hasNumSubIterations in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the numSubIterations field is set.
    • getNumSubIterations

      public int getNumSubIterations()
       Number of EM sub iterations.
       
      optional int32 num_sub_iterations = 17 [default = 2];
      Specified by:
      getNumSubIterations in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The numSubIterations.
    • hasMaxSentencepieceLength

      public boolean hasMaxSentencepieceLength()
      /////////////////////////////////////////////////////////////////
       SentencePiece parameters which control the shapes of sentence piece.
      
       Maximum length of sentencepiece.
       
      optional int32 max_sentencepiece_length = 20 [default = 16];
      Specified by:
      hasMaxSentencepieceLength in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the maxSentencepieceLength field is set.
    • getMaxSentencepieceLength

      public int getMaxSentencepieceLength()
      /////////////////////////////////////////////////////////////////
       SentencePiece parameters which control the shapes of sentence piece.
      
       Maximum length of sentencepiece.
       
      optional int32 max_sentencepiece_length = 20 [default = 16];
      Specified by:
      getMaxSentencepieceLength in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The maxSentencepieceLength.
    • hasSplitByUnicodeScript

      public boolean hasSplitByUnicodeScript()
       Uses Unicode script to split sentence pieces.
       When `split_by_unicode_script` is true, we do not allow sentence piece to
       include multiple Unicode scripts, e.g. "F1" is not a valid piece.
       Exception: CJ characters (Hiragana/Katakana/Han) are all handled
       as one script type, since a Japanese word can consist of multiple scripts.
       This exception is always applied regardless of the accept-language
       parameter.
       
      optional bool split_by_unicode_script = 21 [default = true];
      Specified by:
      hasSplitByUnicodeScript in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the splitByUnicodeScript field is set.
    • getSplitByUnicodeScript

      public boolean getSplitByUnicodeScript()
       Uses Unicode script to split sentence pieces.
       When `split_by_unicode_script` is true, we do not allow sentence piece to
       include multiple Unicode scripts, e.g. "F1" is not a valid piece.
       Exception: CJ characters (Hiragana/Katakana/Han) are all handled
       as one script type, since a Japanese word can consist of multiple scripts.
       This exception is always applied regardless of the accept-language
       parameter.
       
      optional bool split_by_unicode_script = 21 [default = true];
      Specified by:
      getSplitByUnicodeScript in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The splitByUnicodeScript.
    • hasSplitByNumber

      public boolean hasSplitByNumber()
       When `split_by_number` is true, put a boundary between number and
       non-number transition. If we want to treat "F1" as one token, set this flag
       to be false.
       
      optional bool split_by_number = 23 [default = true];
      Specified by:
      hasSplitByNumber in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the splitByNumber field is set.
    • getSplitByNumber

      public boolean getSplitByNumber()
       When `split_by_number` is true, put a boundary between number and
       non-number transition. If we want to treat "F1" as one token, set this flag
       to be false.
       
      optional bool split_by_number = 23 [default = true];
      Specified by:
      getSplitByNumber in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The splitByNumber.
    • hasSplitByWhitespace

      public boolean hasSplitByWhitespace()
       Use a white space to split sentence pieces.
       When `split_by_whitespace` is false, we may have the piece containing
       a white space in the middle. e.g., "in_the".
       
      optional bool split_by_whitespace = 22 [default = true];
      Specified by:
      hasSplitByWhitespace in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the splitByWhitespace field is set.
    • getSplitByWhitespace

      public boolean getSplitByWhitespace()
       Use a white space to split sentence pieces.
       When `split_by_whitespace` is false, we may have the piece containing
       a white space in the middle. e.g., "in_the".
       
      optional bool split_by_whitespace = 22 [default = true];
      Specified by:
      getSplitByWhitespace in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The splitByWhitespace.
    • hasTreatWhitespaceAsSuffix

      public boolean hasTreatWhitespaceAsSuffix()
       Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
       hello_. When `treat_whitespace_as_suffix` is true,
       NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
       of sentence.
       
      optional bool treat_whitespace_as_suffix = 24 [default = false];
      Specified by:
      hasTreatWhitespaceAsSuffix in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the treatWhitespaceAsSuffix field is set.
    • getTreatWhitespaceAsSuffix

      public boolean getTreatWhitespaceAsSuffix()
       Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
       hello_. When `treat_whitespace_as_suffix` is true,
       NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
       of sentence.
       
      optional bool treat_whitespace_as_suffix = 24 [default = false];
      Specified by:
      getTreatWhitespaceAsSuffix in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The treatWhitespaceAsSuffix.
    • hasAllowWhitespaceOnlyPieces

      public boolean hasAllowWhitespaceOnlyPieces()
       Allows pieces that only contain whitespaces instead of appearing only as
       prefix or suffix of other pieces.
       
      optional bool allow_whitespace_only_pieces = 26 [default = false];
      Specified by:
      hasAllowWhitespaceOnlyPieces in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the allowWhitespaceOnlyPieces field is set.
    • getAllowWhitespaceOnlyPieces

      public boolean getAllowWhitespaceOnlyPieces()
       Allows pieces that only contain whitespaces instead of appearing only as
       prefix or suffix of other pieces.
       
      optional bool allow_whitespace_only_pieces = 26 [default = false];
      Specified by:
      getAllowWhitespaceOnlyPieces in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The allowWhitespaceOnlyPieces.
    • hasSplitDigits

      public boolean hasSplitDigits()
       Split all digits (0-9) into separate pieces.
       
      optional bool split_digits = 25 [default = false];
      Specified by:
      hasSplitDigits in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the splitDigits field is set.
    • getSplitDigits

      public boolean getSplitDigits()
       Split all digits (0-9) into separate pieces.
       
      optional bool split_digits = 25 [default = false];
      Specified by:
      getSplitDigits in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The splitDigits.
    • hasPretokenizationDelimiter

      public boolean hasPretokenizationDelimiter()
       Defines the pre-tokenization delimiter.
       When specified, no piece crossing this delimiter is included
       in the vocab. The delimiter string is then virtually ignored
       during the training. This field allows constraints on the vocabulary
       selection. Note that this field is available in unigram mode.
       
      optional string pretokenization_delimiter = 53 [default = ""];
      Specified by:
      hasPretokenizationDelimiter in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the pretokenizationDelimiter field is set.
    • getPretokenizationDelimiter

      public String getPretokenizationDelimiter()
       Defines the pre-tokenization delimiter.
       When specified, no piece crossing this delimiter is included
       in the vocab. The delimiter string is then virtually ignored
       during the training. This field allows constraints on the vocabulary
       selection. Note that this field is available in unigram mode.
       
      optional string pretokenization_delimiter = 53 [default = ""];
      Specified by:
      getPretokenizationDelimiter in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The pretokenizationDelimiter.
    • getPretokenizationDelimiterBytes

      public com.google.protobuf.ByteString getPretokenizationDelimiterBytes()
       Defines the pre-tokenization delimiter.
       When specified, no piece crossing this delimiter is included
       in the vocab. The delimiter string is then virtually ignored
       during the training. This field allows constraints on the vocabulary
       selection. Note that this field is available in unigram mode.
       
      optional string pretokenization_delimiter = 53 [default = ""];
      Specified by:
      getPretokenizationDelimiterBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for pretokenizationDelimiter.
    • getControlSymbolsList

      public com.google.protobuf.ProtocolStringList getControlSymbolsList()
      /////////////////////////////////////////////////////////////////
       Vocabulary management
      
       Defines control symbols used as an indicator to
       change the behavior of the decoder. <s> and </s> are pre-defined.
       We can use this field to encode various meta information,
       including language indicator in multilingual model.
       These symbols are not visible to users, but visible to
       the decoder. Note that when the input sentence contains control symbols,
       they are not treated as one token, but segmented into normal pieces.
       Control symbols must be inserted independently from the segmentation.
       
      repeated string control_symbols = 30;
      Specified by:
      getControlSymbolsList in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      A list containing the controlSymbols.
    • getControlSymbolsCount

      public int getControlSymbolsCount()
      /////////////////////////////////////////////////////////////////
       Vocabulary management
      
       Defines control symbols used as an indicator to
       change the behavior of the decoder. <s> and </s> are pre-defined.
       We can use this field to encode various meta information,
       including language indicator in multilingual model.
       These symbols are not visible to users, but visible to
       the decoder. Note that when the input sentence contains control symbols,
       they are not treated as one token, but segmented into normal pieces.
       Control symbols must be inserted independently from the segmentation.
       
      repeated string control_symbols = 30;
      Specified by:
      getControlSymbolsCount in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The count of controlSymbols.
    • getControlSymbols

      public String getControlSymbols(int index)
      /////////////////////////////////////////////////////////////////
       Vocabulary management
      
       Defines control symbols used as an indicator to
       change the behavior of the decoder. <s> and </s> are pre-defined.
       We can use this field to encode various meta information,
       including language indicator in multilingual model.
       These symbols are not visible to users, but visible to
       the decoder. Note that when the input sentence contains control symbols,
       they are not treated as one token, but segmented into normal pieces.
       Control symbols must be inserted independently from the segmentation.
       
      repeated string control_symbols = 30;
      Specified by:
      getControlSymbols in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the element to return.
      Returns:
      The controlSymbols at the given index.
    • getControlSymbolsBytes

      public com.google.protobuf.ByteString getControlSymbolsBytes(int index)
      /////////////////////////////////////////////////////////////////
       Vocabulary management
      
       Defines control symbols used as an indicator to
       change the behavior of the decoder. <s> and </s> are pre-defined.
       We can use this field to encode various meta information,
       including language indicator in multilingual model.
       These symbols are not visible to users, but visible to
       the decoder. Note that when the input sentence contains control symbols,
       they are not treated as one token, but segmented into normal pieces.
       Control symbols must be inserted independently from the segmentation.
       
      repeated string control_symbols = 30;
      Specified by:
      getControlSymbolsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the value to return.
      Returns:
      The bytes of the controlSymbols at the given index.
    • getUserDefinedSymbolsList

      public com.google.protobuf.ProtocolStringList getUserDefinedSymbolsList()
       Defines user defined symbols.
       These symbols are added with extremely high score
       so they are always treated as one unique symbol in any context.
       Typical usage of user_defined_symbols is placeholder for named entities.
       
      repeated string user_defined_symbols = 31;
      Specified by:
      getUserDefinedSymbolsList in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      A list containing the userDefinedSymbols.
    • getUserDefinedSymbolsCount

      public int getUserDefinedSymbolsCount()
       Defines user defined symbols.
       These symbols are added with extremely high score
       so they are always treated as one unique symbol in any context.
       Typical usage of user_defined_symbols is placeholder for named entities.
       
      repeated string user_defined_symbols = 31;
      Specified by:
      getUserDefinedSymbolsCount in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The count of userDefinedSymbols.
    • getUserDefinedSymbols

      public String getUserDefinedSymbols(int index)
       Defines user defined symbols.
       These symbols are added with extremely high score
       so they are always treated as one unique symbol in any context.
       Typical usage of user_defined_symbols is placeholder for named entities.
       
      repeated string user_defined_symbols = 31;
      Specified by:
      getUserDefinedSymbols in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the element to return.
      Returns:
      The userDefinedSymbols at the given index.
    • getUserDefinedSymbolsBytes

      public com.google.protobuf.ByteString getUserDefinedSymbolsBytes(int index)
       Defines user defined symbols.
       These symbols are added with extremely high score
       so they are always treated as one unique symbol in any context.
       Typical usage of user_defined_symbols is placeholder for named entities.
       
      repeated string user_defined_symbols = 31;
      Specified by:
      getUserDefinedSymbolsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Parameters:
      index - The index of the value to return.
      Returns:
      The bytes of the userDefinedSymbols at the given index.
    • hasRequiredChars

      public boolean hasRequiredChars()
       Defines required characters. Each UTF8 character in this string is included
       in the character set regardless of character_coverage value. Unlike
       user_defined_symbols, these characters have scores based on the frequency
       in the input sentences, and the model can form subwords using characters
       in this field.
       
      optional string required_chars = 36;
      Specified by:
      hasRequiredChars in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the requiredChars field is set.
    • getRequiredChars

      public String getRequiredChars()
       Defines required characters. Each UTF8 character in this string is included
       in the character set regardless of character_coverage value. Unlike
       user_defined_symbols, these characters have scores based on the frequency
       in the input sentences, and the model can form subwords using characters
       in this field.
       
      optional string required_chars = 36;
      Specified by:
      getRequiredChars in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The requiredChars.
    • getRequiredCharsBytes

      public com.google.protobuf.ByteString getRequiredCharsBytes()
       Defines required characters. Each UTF8 character in this string is included
       in the character set regardless of character_coverage value. Unlike
       user_defined_symbols, these characters have scores based on the frequency
       in the input sentences, and the model can form subwords using characters
       in this field.
       
      optional string required_chars = 36;
      Specified by:
      getRequiredCharsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for requiredChars.
    • hasByteFallback

      public boolean hasByteFallback()
       Decomposes unknown pieces into UTF-8 bytes.
       
      optional bool byte_fallback = 35 [default = false];
      Specified by:
      hasByteFallback in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the byteFallback field is set.
    • getByteFallback

      public boolean getByteFallback()
       Decomposes unknown pieces into UTF-8 bytes.
       
      optional bool byte_fallback = 35 [default = false];
      Specified by:
      getByteFallback in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The byteFallback.
    • hasVocabularyOutputPieceScore

      public boolean hasVocabularyOutputPieceScore()
       When creating the vocabulary file, defines whether or not to additionally
       output the score for each piece.
       
      optional bool vocabulary_output_piece_score = 32 [default = true];
      Specified by:
      hasVocabularyOutputPieceScore in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the vocabularyOutputPieceScore field is set.
    • getVocabularyOutputPieceScore

      public boolean getVocabularyOutputPieceScore()
       When creating the vocabulary file, defines whether or not to additionally
       output the score for each piece.
       
      optional bool vocabulary_output_piece_score = 32 [default = true];
      Specified by:
      getVocabularyOutputPieceScore in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The vocabularyOutputPieceScore.
    • hasHardVocabLimit

      public boolean hasHardVocabLimit()
       `vocab_size` is treated as a hard limit. Crash if
       the model cannot produce a vocab of size `vocab_size`.
       When `hard_vocab_limit` is false, vocab_size is treated
       as a soft limit. Note that when model_type=char,
       hard_vocab_limit = false is always assumed.
       
      optional bool hard_vocab_limit = 33 [default = true];
      Specified by:
      hasHardVocabLimit in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the hardVocabLimit field is set.
    • getHardVocabLimit

      public boolean getHardVocabLimit()
       `vocab_size` is treated as a hard limit. Crash if
       the model cannot produce a vocab of size `vocab_size`.
       When `hard_vocab_limit` is false, vocab_size is treated
       as a soft limit. Note that when model_type=char,
       hard_vocab_limit = false is always assumed.
       
      optional bool hard_vocab_limit = 33 [default = true];
      Specified by:
      getHardVocabLimit in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The hardVocabLimit.
    • hasUseAllVocab

      public boolean hasUseAllVocab()
       Use all symbols for vocab extraction. This flag is valid
       if the model type is either CHAR or WORD.
       
      optional bool use_all_vocab = 34 [default = false];
      Specified by:
      hasUseAllVocab in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the useAllVocab field is set.
    • getUseAllVocab

      public boolean getUseAllVocab()
       Use all symbols for vocab extraction. This flag is valid
       if the model type is either CHAR or WORD.
       
      optional bool use_all_vocab = 34 [default = false];
      Specified by:
      getUseAllVocab in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The useAllVocab.
    • hasUnkId

      public boolean hasUnkId()
      /////////////////////////////////////////////////////////////////
       Reserved special meta tokens.
       * -1 is not used.
       * unk_id must not be -1.
       Ids must start with 0 and be contiguous.
       
      optional int32 unk_id = 40 [default = 0];
      Specified by:
      hasUnkId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the unkId field is set.
    • getUnkId

      public int getUnkId()
      /////////////////////////////////////////////////////////////////
       Reserved special meta tokens.
       * -1 is not used.
       * unk_id must not be -1.
       Ids must start with 0 and be contiguous.
       
      optional int32 unk_id = 40 [default = 0];
      Specified by:
      getUnkId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The unkId.
    • hasBosId

      public boolean hasBosId()
       <s>
       
      optional int32 bos_id = 41 [default = 1];
      Specified by:
      hasBosId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the bosId field is set.
    • getBosId

      public int getBosId()
       <s>
       
      optional int32 bos_id = 41 [default = 1];
      Specified by:
      getBosId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bosId.
    • hasEosId

      public boolean hasEosId()
       </s>
       
      optional int32 eos_id = 42 [default = 2];
      Specified by:
      hasEosId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the eosId field is set.
    • getEosId

      public int getEosId()
       </s>
       
      optional int32 eos_id = 42 [default = 2];
      Specified by:
      getEosId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The eosId.
    • hasPadId

      public boolean hasPadId()
       <pad> (padding)
       
      optional int32 pad_id = 43 [default = -1];
      Specified by:
      hasPadId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the padId field is set.
    • getPadId

      public int getPadId()
       <pad> (padding)
       
      optional int32 pad_id = 43 [default = -1];
      Specified by:
      getPadId in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The padId.
    • hasUnkPiece

      public boolean hasUnkPiece()
      optional string unk_piece = 45 [default = "<unk>"];
      Specified by:
      hasUnkPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the unkPiece field is set.
    • getUnkPiece

      public String getUnkPiece()
      optional string unk_piece = 45 [default = "<unk>"];
      Specified by:
      getUnkPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The unkPiece.
    • getUnkPieceBytes

      public com.google.protobuf.ByteString getUnkPieceBytes()
      optional string unk_piece = 45 [default = "<unk>"];
      Specified by:
      getUnkPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for unkPiece.
    • hasBosPiece

      public boolean hasBosPiece()
      optional string bos_piece = 46 [default = "<s>"];
      Specified by:
      hasBosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the bosPiece field is set.
    • getBosPiece

      public String getBosPiece()
      optional string bos_piece = 46 [default = "<s>"];
      Specified by:
      getBosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bosPiece.
    • getBosPieceBytes

      public com.google.protobuf.ByteString getBosPieceBytes()
      optional string bos_piece = 46 [default = "<s>"];
      Specified by:
      getBosPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for bosPiece.
    • hasEosPiece

      public boolean hasEosPiece()
      optional string eos_piece = 47 [default = "</s>"];
      Specified by:
      hasEosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the eosPiece field is set.
    • getEosPiece

      public String getEosPiece()
      optional string eos_piece = 47 [default = "</s>"];
      Specified by:
      getEosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The eosPiece.
    • getEosPieceBytes

      public com.google.protobuf.ByteString getEosPieceBytes()
      optional string eos_piece = 47 [default = "</s>"];
      Specified by:
      getEosPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for eosPiece.
    • hasPadPiece

      public boolean hasPadPiece()
      optional string pad_piece = 48 [default = "<pad>"];
      Specified by:
      hasPadPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the padPiece field is set.
    • getPadPiece

      public String getPadPiece()
      optional string pad_piece = 48 [default = "<pad>"];
      Specified by:
      getPadPiece in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The padPiece.
    • getPadPieceBytes

      public com.google.protobuf.ByteString getPadPieceBytes()
      optional string pad_piece = 48 [default = "<pad>"];
      Specified by:
      getPadPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for padPiece.
    • hasUnkSurface

      public boolean hasUnkSurface()
       Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
       since this character can be useful both for user and
       developer. We can easily figure out that <unk> is emitted.
       
      optional string unk_surface = 44 [default = " \342\201\207 "];
      Specified by:
      hasUnkSurface in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the unkSurface field is set.
    • getUnkSurface

      public String getUnkSurface()
       Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
       since this character can be useful both for user and
       developer. We can easily figure out that <unk> is emitted.
       
      optional string unk_surface = 44 [default = " \342\201\207 "];
      Specified by:
      getUnkSurface in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The unkSurface.
    • getUnkSurfaceBytes

      public com.google.protobuf.ByteString getUnkSurfaceBytes()
       Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
       since this character can be useful both for user and
       developer. We can easily figure out that <unk> is emitted.
       
      optional string unk_surface = 44 [default = " \342\201\207 "];
      Specified by:
      getUnkSurfaceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for unkSurface.
    • hasTrainExtremelyLargeCorpus

      public boolean hasTrainExtremelyLargeCorpus()
       Increase bit depth to allow unigram model training on large
       (>10M sentences) corpora. A side effect of enabling this flag
       is increased memory usage.
       
      optional bool train_extremely_large_corpus = 49 [default = false];
      Specified by:
      hasTrainExtremelyLargeCorpus in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the trainExtremelyLargeCorpus field is set.
    • getTrainExtremelyLargeCorpus

      public boolean getTrainExtremelyLargeCorpus()
       Increase bit depth to allow unigram model training on large
       (>10M sentences) corpora. A side effect of enabling this flag
       is increased memory usage.
       
      optional bool train_extremely_large_corpus = 49 [default = false];
      Specified by:
      getTrainExtremelyLargeCorpus in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The trainExtremelyLargeCorpus.
    • hasSeedSentencepiecesFile

      public boolean hasSeedSentencepiecesFile()
       Path to a seed sentencepieces file, with one tab-separated
       seed sentencepiece <tab> frequency per line.
       
      optional string seed_sentencepieces_file = 54 [default = ""];
      Specified by:
      hasSeedSentencepiecesFile in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      Whether the seedSentencepiecesFile field is set.
    • getSeedSentencepiecesFile

      public String getSeedSentencepiecesFile()
       Path to a seed sentencepieces file, with one tab-separated
       seed sentencepiece <tab> frequency per line.
       
      optional string seed_sentencepieces_file = 54 [default = ""];
      Specified by:
      getSeedSentencepiecesFile in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The seedSentencepiecesFile.
    • getSeedSentencepiecesFileBytes

      public com.google.protobuf.ByteString getSeedSentencepiecesFileBytes()
       Path to a seed sentencepieces file, with one tab-separated
       seed sentencepiece <tab> frequency per line.
       
      optional string seed_sentencepieces_file = 54 [default = ""];
      Specified by:
      getSeedSentencepiecesFileBytes in interface SentencepieceModel.TrainerSpecOrBuilder
      Returns:
      The bytes for seedSentencepiecesFile.
    • isInitialized

      public final boolean isInitialized()
      Specified by:
      isInitialized in interface com.google.protobuf.MessageLiteOrBuilder
      Overrides:
      isInitialized in class com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec>
    • writeTo

      public void writeTo(com.google.protobuf.CodedOutputStream output) throws IOException
      Specified by:
      writeTo in interface com.google.protobuf.MessageLite
      Overrides:
      writeTo in class com.google.protobuf.GeneratedMessageV3
      Throws:
      IOException
    • getSerializedSize

      public int getSerializedSize()
      Specified by:
      getSerializedSize in interface com.google.protobuf.MessageLite
      Overrides:
      getSerializedSize in class com.google.protobuf.GeneratedMessageV3
    • equals

      public boolean equals(Object obj)
      Specified by:
      equals in interface com.google.protobuf.Message
      Overrides:
      equals in class com.google.protobuf.AbstractMessage
    • hashCode

      public int hashCode()
      Specified by:
      hashCode in interface com.google.protobuf.Message
      Overrides:
      hashCode in class com.google.protobuf.AbstractMessage
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(ByteBuffer data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
      Throws:
      com.google.protobuf.InvalidProtocolBufferException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(InputStream input) throws IOException
      Throws:
      IOException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
      Throws:
      IOException
    • parseDelimitedFrom

      public static SentencepieceModel.TrainerSpec parseDelimitedFrom(InputStream input) throws IOException
      Throws:
      IOException
    • parseDelimitedFrom

      public static SentencepieceModel.TrainerSpec parseDelimitedFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
      Throws:
      IOException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.CodedInputStream input) throws IOException
      Throws:
      IOException
    • parseFrom

      public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
      Throws:
      IOException
    • newBuilderForType

      public SentencepieceModel.TrainerSpec.Builder newBuilderForType()
      Specified by:
      newBuilderForType in interface com.google.protobuf.Message
      Specified by:
      newBuilderForType in interface com.google.protobuf.MessageLite
    • newBuilder

      public static SentencepieceModel.TrainerSpec.Builder newBuilder()
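    • newBuilder

      public static SentencepieceModel.TrainerSpec.Builder newBuilder(SentencepieceModel.TrainerSpec prototype)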
    • toBuilder

      public SentencepieceModel.TrainerSpec.Builder toBuilder()
      Specified by:
      toBuilder in interface com.google.protobuf.Message
      Specified by:
      toBuilder in interface com.google.protobuf.MessageLite
    • getDefaultInstance

      public static SentencepieceModel.TrainerSpec getDefaultInstance()
    • parser

      public static com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> parser()
    • getParserForType

      public com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> getParserForType()
      Specified by:
      getParserForType in interface com.google.protobuf.Message
      Specified by:
      getParserForType in interface com.google.protobuf.MessageLite
      Overrides:
      getParserForType in class com.google.protobuf.GeneratedMessageV3
    • getDefaultInstanceForType

      public SentencepieceModel.TrainerSpec getDefaultInstanceForType()
      Specified by:
      getDefaultInstanceForType in interface com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder<SentencepieceModel.TrainerSpec>
      Specified by:
      getDefaultInstanceForType in interface com.google.protobuf.MessageLiteOrBuilder
      Specified by:
      getDefaultInstanceForType in interface com.google.protobuf.MessageOrBuilder