com.google.protobuf.AbstractMessageLite

com.google.protobuf.AbstractMessage

com.google.protobuf.GeneratedMessageV3

com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec>

com.google.genai.proto.SentencepieceModel.TrainerSpec

All Implemented Interfaces: SentencepieceModel.TrainerSpecOrBuilder, com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder<SentencepieceModel.TrainerSpec>, com.google.protobuf.Message, com.google.protobuf.MessageLite, com.google.protobuf.MessageLiteOrBuilder, com.google.protobuf.MessageOrBuilder, Serializable

Enclosing class: SentencepieceModel

public static final class SentencepieceModel.TrainerSpec extends com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec> implements SentencepieceModel.TrainerSpecOrBuilder

 TrainerSpec encodes various parameters for SentencePiece training.
 Next id: 55

Protobuf type com.google.genai.proto.TrainerSpec

See Also:

Serialized Form

Nested Class Summary

Nested Classes

Modifier and Type

Class

Description

static final class

SentencepieceModel.TrainerSpec.Builder

TrainerSpec encodes various parameters for SentencePiece training.

static enum

SentencepieceModel.TrainerSpec.ModelType

Model type.

Nested classes/interfaces inherited from class com.google.protobuf.GeneratedMessageV3
com.google.protobuf.GeneratedMessageV3.ExtendableBuilder<MessageT extends com.google.protobuf.GeneratedMessageV3.ExtendableMessage<MessageT>,BuilderT extends com.google.protobuf.GeneratedMessageV3.ExtendableBuilder<MessageT,BuilderT>>, com.google.protobuf.GeneratedMessageV3.ExtendableMessage<MessageT extends com.google.protobuf.GeneratedMessageV3.ExtendableMessage<MessageT>>, com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder<MessageT extends com.google.protobuf.GeneratedMessageV3.ExtendableMessage<MessageT>>, com.google.protobuf.GeneratedMessageV3.FieldAccessorTable
Field Summary

Fields

Modifier and Type

Field

Description

static final int

ACCEPT_LANGUAGE_FIELD_NUMBER

static final int

ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER

static final int

BOS_ID_FIELD_NUMBER

static final int

BOS_PIECE_FIELD_NUMBER

static final int

BYTE_FALLBACK_FIELD_NUMBER

static final int

CHARACTER_COVERAGE_FIELD_NUMBER

static final int

CONTROL_SYMBOLS_FIELD_NUMBER

static final int

DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER

static final int

DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER

static final int

ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER

static final int

EOS_ID_FIELD_NUMBER

static final int

EOS_PIECE_FIELD_NUMBER

static final int

HARD_VOCAB_LIMIT_FIELD_NUMBER

static final int

INPUT_FIELD_NUMBER

static final int

INPUT_FORMAT_FIELD_NUMBER

static final int

INPUT_SENTENCE_SIZE_FIELD_NUMBER

static final int

MAX_SENTENCE_LENGTH_FIELD_NUMBER

static final int

MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER

static final int

MINING_SENTENCE_SIZE_FIELD_NUMBER

static final int

MODEL_PREFIX_FIELD_NUMBER

static final int

MODEL_TYPE_FIELD_NUMBER

static final int

NUM_SUB_ITERATIONS_FIELD_NUMBER

static final int

NUM_THREADS_FIELD_NUMBER

static final int

PAD_ID_FIELD_NUMBER

static final int

PAD_PIECE_FIELD_NUMBER

static final com.google.protobuf.Parser<SentencepieceModel.TrainerSpec>

PARSER

Deprecated.

static final int

PRETOKENIZATION_DELIMITER_FIELD_NUMBER

static final int

REQUIRED_CHARS_FIELD_NUMBER

static final int

SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER

static final int

SEED_SENTENCEPIECES_FILE_FIELD_NUMBER

static final int

SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER

static final int

SHRINKING_FACTOR_FIELD_NUMBER

static final int

SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER

static final int

SPLIT_BY_NUMBER_FIELD_NUMBER

static final int

SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER

static final int

SPLIT_BY_WHITESPACE_FIELD_NUMBER

static final int

SPLIT_DIGITS_FIELD_NUMBER

static final int

TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER

static final int

TRAINING_SENTENCE_SIZE_FIELD_NUMBER

static final int

TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER

static final int

UNK_ID_FIELD_NUMBER

static final int

UNK_PIECE_FIELD_NUMBER

static final int

UNK_SURFACE_FIELD_NUMBER

static final int

USE_ALL_VOCAB_FIELD_NUMBER

static final int

USER_DEFINED_SYMBOLS_FIELD_NUMBER

static final int

VOCAB_SIZE_FIELD_NUMBER

static final int

VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER
Method Summary

Modifier and Type

Method

Description

boolean

equals(Object obj)

String

getAcceptLanguage(int index)

List of the languages this model can accept.

com.google.protobuf.ByteString

getAcceptLanguageBytes(int index)

List of the languages this model can accept.

int

getAcceptLanguageCount()

List of the languages this model can accept.

com.google.protobuf.ProtocolStringList

getAcceptLanguageList()

List of the languages this model can accept.

boolean

getAllowWhitespaceOnlyPieces()

Allows pieces that only contain whitespaces instead of appearing only as prefix or suffix of other pieces.

int

getBosId()

<s>

String

getBosPiece()

optional string bos_piece = 46 [default = "<s>"];

com.google.protobuf.ByteString

getBosPieceBytes()

optional string bos_piece = 46 [default = "<s>"];

boolean

getByteFallback()

Decomposes unknown pieces into UTF-8 bytes.

float

getCharacterCoverage()

Training parameters.

String

getControlSymbols(int index)

Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder.

com.google.protobuf.ByteString

getControlSymbolsBytes(int index)

Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder.

int

getControlSymbolsCount()

Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder.

com.google.protobuf.ProtocolStringList

getControlSymbolsList()

Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder.

static SentencepieceModel.TrainerSpec

getDefaultInstance()

SentencepieceModel.TrainerSpec

getDefaultInstanceForType()

static final com.google.protobuf.Descriptors.Descriptor

getDescriptor()

long

getDifferentialPrivacyClippingThreshold()

Clipping threshold to apply after adding noise.

float

getDifferentialPrivacyNoiseLevel()

Set these parameters if you need DP version of sentencepiece.

boolean

getEnableDifferentialPrivacy()

Whether to use DP version of sentencepiece.

int

getEosId()

</s>

String

getEosPiece()

optional string eos_piece = 47 [default = "</s>"];

com.google.protobuf.ByteString

getEosPieceBytes()

optional string eos_piece = 47 [default = "</s>"];

boolean

getHardVocabLimit()

`vocab_size` is treated as hard limit.

String

getInput(int index)

General parameters. Input corpus files.

com.google.protobuf.ByteString

getInputBytes(int index)

General parameters. Input corpus files.

int

getInputCount()

General parameters. Input corpus files.

String

getInputFormat()

Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq

com.google.protobuf.ByteString

getInputFormatBytes()

Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq

com.google.protobuf.ProtocolStringList

getInputList()

General parameters. Input corpus files.

long

getInputSentenceSize()

Maximum size of sentences the trainer loads from `input` parameter.

int

getMaxSentenceLength()

The maximum sentence length in bytes.

int

getMaxSentencepieceLength()

SentencePiece parameters which control the shapes of sentence pieces.

int

getMiningSentenceSize()

Deprecated.
com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated.

String

getModelPrefix()

Output model file prefix.

com.google.protobuf.ByteString

getModelPrefixBytes()

Output model file prefix.

SentencepieceModel.TrainerSpec.ModelType

getModelType()

optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];

int

getNumSubIterations()

Number of EM sub iterations.

int

getNumThreads()

Number of threads in the training.

int

getPadId()

<pad> (padding)

String

getPadPiece()

optional string pad_piece = 48 [default = "<pad>"];

com.google.protobuf.ByteString

getPadPieceBytes()

optional string pad_piece = 48 [default = "<pad>"];

com.google.protobuf.Parser<SentencepieceModel.TrainerSpec>

getParserForType()

String

getPretokenizationDelimiter()

Defines the pre-tokenization delimiter.

com.google.protobuf.ByteString

getPretokenizationDelimiterBytes()

Defines the pre-tokenization delimiter.

String

getRequiredChars()

Defines required characters.

com.google.protobuf.ByteString

getRequiredCharsBytes()

Defines required characters.

String

getSeedSentencepiecesFile()

Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.

com.google.protobuf.ByteString

getSeedSentencepiecesFileBytes()

Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.

int

getSeedSentencepieceSize()

The size of seed sentencepieces.

int

getSelfTestSampleSize()

Size of self-test samples, which are encoded in the model file.

int

getSerializedSize()

float

getShrinkingFactor()

In every EM sub-iteration, keeps the top `shrinking_factor` * `current sentencepieces size` with respect to the loss of the sentence piece.

boolean

getShuffleInputSentence()

optional bool shuffle_input_sentence = 19 [default = true];

boolean

getSplitByNumber()

When `split_by_number` is true, put a boundary between number and non-number transition.

boolean

getSplitByUnicodeScript()

Uses Unicode script to split sentence pieces.

boolean

getSplitByWhitespace()

Use a white space to split sentence pieces.

boolean

getSplitDigits()

Split all digits (0-9) into separate pieces.

boolean

getTrainExtremelyLargeCorpus()

Increase bit depth to allow unigram model training on large (>10M sentences) corpora.

int

getTrainingSentenceSize()

Deprecated.
com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated.

boolean

getTreatWhitespaceAsSuffix()

Adds whitespace symbol (_) as a suffix instead of prefix.

int

getUnkId()

Reserved special meta tokens.

String

getUnkPiece()

optional string unk_piece = 45 [default = "<unk>"];

com.google.protobuf.ByteString

getUnkPieceBytes()

optional string unk_piece = 45 [default = "<unk>"];

String

getUnkSurface()

Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful both for user and developer.

com.google.protobuf.ByteString

getUnkSurfaceBytes()

Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful both for user and developer.

boolean

getUseAllVocab()

Use all symbols for vocab extraction.

String

getUserDefinedSymbols(int index)

Defines user defined symbols.

com.google.protobuf.ByteString

getUserDefinedSymbolsBytes(int index)

Defines user defined symbols.

int

getUserDefinedSymbolsCount()

Defines user defined symbols.

com.google.protobuf.ProtocolStringList

getUserDefinedSymbolsList()

Defines user defined symbols.

int

getVocabSize()

Vocabulary size.

boolean

getVocabularyOutputPieceScore()

When creating the vocabulary file, defines whether or not to additionally output the score for each piece.

boolean

hasAllowWhitespaceOnlyPieces()

Allows pieces that only contain whitespaces instead of appearing only as prefix or suffix of other pieces.

boolean

hasBosId()

<s>

boolean

hasBosPiece()

optional string bos_piece = 46 [default = "<s>"];

boolean

hasByteFallback()

Decomposes unknown pieces into UTF-8 bytes.

boolean

hasCharacterCoverage()

Training parameters.

boolean

hasDifferentialPrivacyClippingThreshold()

Clipping threshold to apply after adding noise.

boolean

hasDifferentialPrivacyNoiseLevel()

Set these parameters if you need DP version of sentencepiece.

boolean

hasEnableDifferentialPrivacy()

Whether to use DP version of sentencepiece.

boolean

hasEosId()

</s>

boolean

hasEosPiece()

optional string eos_piece = 47 [default = "</s>"];

boolean

hasHardVocabLimit()

`vocab_size` is treated as hard limit.

int

hashCode()

boolean

hasInputFormat()

Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq

boolean

hasInputSentenceSize()

Maximum size of sentences the trainer loads from `input` parameter.

boolean

hasMaxSentenceLength()

The maximum sentence length in bytes.

boolean

hasMaxSentencepieceLength()

SentencePiece parameters which control the shapes of sentence pieces.

boolean

hasMiningSentenceSize()

Deprecated.
com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated.

boolean

hasModelPrefix()

Output model file prefix.

boolean

hasModelType()

optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];

boolean

hasNumSubIterations()

Number of EM sub iterations.

boolean

hasNumThreads()

Number of threads in the training.

boolean

hasPadId()

<pad> (padding)

boolean

hasPadPiece()

optional string pad_piece = 48 [default = "<pad>"];

boolean

hasPretokenizationDelimiter()

Defines the pre-tokenization delimiter.

boolean

hasRequiredChars()

Defines required characters.

boolean

hasSeedSentencepiecesFile()

Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.

boolean

hasSeedSentencepieceSize()

The size of seed sentencepieces.

boolean

hasSelfTestSampleSize()

Size of self-test samples, which are encoded in the model file.

boolean

hasShrinkingFactor()

In every EM sub-iteration, keeps the top `shrinking_factor` * `current sentencepieces size` with respect to the loss of the sentence piece.

boolean

hasShuffleInputSentence()

optional bool shuffle_input_sentence = 19 [default = true];

boolean

hasSplitByNumber()

When `split_by_number` is true, put a boundary between number and non-number transition.

boolean

hasSplitByUnicodeScript()

Uses Unicode script to split sentence pieces.

boolean

hasSplitByWhitespace()

Use a white space to split sentence pieces.

boolean

hasSplitDigits()

Split all digits (0-9) into separate pieces.

boolean

hasTrainExtremelyLargeCorpus()

Increase bit depth to allow unigram model training on large (>10M sentences) corpora.

boolean

hasTrainingSentenceSize()

Deprecated.
com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated.

boolean

hasTreatWhitespaceAsSuffix()

Adds whitespace symbol (_) as a suffix instead of prefix.

boolean

hasUnkId()

Reserved special meta tokens.

boolean

hasUnkPiece()

optional string unk_piece = 45 [default = "<unk>"];

boolean

hasUnkSurface()

Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful both for user and developer.

boolean

hasUseAllVocab()

Use all symbols for vocab extraction.

boolean

hasVocabSize()

Vocabulary size.

boolean

hasVocabularyOutputPieceScore()

When creating the vocabulary file, defines whether or not to additionally output the score for each piece.

final boolean

isInitialized()

static SentencepieceModel.TrainerSpec.Builder

newBuilder()

static SentencepieceModel.TrainerSpec.Builder

newBuilder(SentencepieceModel.TrainerSpec prototype)

SentencepieceModel.TrainerSpec.Builder

newBuilderForType()

static SentencepieceModel.TrainerSpec

parseDelimitedFrom(InputStream input)

static SentencepieceModel.TrainerSpec

parseDelimitedFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static SentencepieceModel.TrainerSpec

parseFrom(byte[] data)

static SentencepieceModel.TrainerSpec

parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static SentencepieceModel.TrainerSpec

parseFrom(com.google.protobuf.ByteString data)

static SentencepieceModel.TrainerSpec

parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static SentencepieceModel.TrainerSpec

parseFrom(com.google.protobuf.CodedInputStream input)

static SentencepieceModel.TrainerSpec

parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static SentencepieceModel.TrainerSpec

parseFrom(InputStream input)

static SentencepieceModel.TrainerSpec

parseFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static SentencepieceModel.TrainerSpec

parseFrom(ByteBuffer data)

static SentencepieceModel.TrainerSpec

parseFrom(ByteBuffer data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)

static com.google.protobuf.Parser<SentencepieceModel.TrainerSpec>

parser()

SentencepieceModel.TrainerSpec.Builder

toBuilder()

void

writeTo(com.google.protobuf.CodedOutputStream output)

Methods inherited from class com.google.protobuf.GeneratedMessageV3.ExtendableMessage
getAllFields, getAllFieldsRaw, getExtension, getExtension, getExtension, getExtension, getExtension, getExtension, getExtensionCount, getExtensionCount, getExtensionCount, getField, getRepeatedField, getRepeatedFieldCount, hasExtension, hasExtension, hasExtension, hasField

Methods inherited from class com.google.protobuf.GeneratedMessageV3
getDescriptorForType, getOneofFieldDescriptor, getUnknownFields, hasOneof

Methods inherited from class com.google.protobuf.AbstractMessage
findInitializationErrors, getInitializationErrorString, toString

Methods inherited from class com.google.protobuf.AbstractMessageLite
toByteArray, toByteString, writeDelimitedTo, writeTo

Methods inherited from class java.lang.Object
getClass, notify, notifyAll, wait, wait, wait

Methods inherited from interface com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder
getExtension, getExtension, getExtension, getExtension, getExtension, getExtension, getExtensionCount, getExtensionCount, getExtensionCount, hasExtension, hasExtension, hasExtension

Methods inherited from interface com.google.protobuf.MessageLite
toByteArray, toByteString, writeDelimitedTo, writeTo

Methods inherited from interface com.google.protobuf.MessageOrBuilder
findInitializationErrors, getAllFields, getDescriptorForType, getField, getInitializationErrorString, getOneofFieldDescriptor, getRepeatedField, getRepeatedFieldCount, getUnknownFields, hasField, hasOneof

Field Details
- INPUT_FIELD_NUMBER
  
  public static final int INPUT_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- INPUT_FORMAT_FIELD_NUMBER
  
  public static final int INPUT_FORMAT_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- MODEL_PREFIX_FIELD_NUMBER
  
  public static final int MODEL_PREFIX_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- MODEL_TYPE_FIELD_NUMBER
  
  public static final int MODEL_TYPE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- VOCAB_SIZE_FIELD_NUMBER
  
  public static final int VOCAB_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- ACCEPT_LANGUAGE_FIELD_NUMBER
  
  public static final int ACCEPT_LANGUAGE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER
  
  public static final int SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER
  
  public static final int ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER
  
  public static final int DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER
  
  public static final int DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- CHARACTER_COVERAGE_FIELD_NUMBER
  
  public static final int CHARACTER_COVERAGE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- INPUT_SENTENCE_SIZE_FIELD_NUMBER
  
  public static final int INPUT_SENTENCE_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER
  
  public static final int SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- MINING_SENTENCE_SIZE_FIELD_NUMBER
  
  public static final int MINING_SENTENCE_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- TRAINING_SENTENCE_SIZE_FIELD_NUMBER
  
  public static final int TRAINING_SENTENCE_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER
  
  public static final int SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SHRINKING_FACTOR_FIELD_NUMBER
  
  public static final int SHRINKING_FACTOR_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- MAX_SENTENCE_LENGTH_FIELD_NUMBER
  
  public static final int MAX_SENTENCE_LENGTH_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- NUM_THREADS_FIELD_NUMBER
  
  public static final int NUM_THREADS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- NUM_SUB_ITERATIONS_FIELD_NUMBER
  
  public static final int NUM_SUB_ITERATIONS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER
  
  public static final int MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER
  
  public static final int SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SPLIT_BY_NUMBER_FIELD_NUMBER
  
  public static final int SPLIT_BY_NUMBER_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SPLIT_BY_WHITESPACE_FIELD_NUMBER
  
  public static final int SPLIT_BY_WHITESPACE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER
  
  public static final int TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER
  
  public static final int ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SPLIT_DIGITS_FIELD_NUMBER
  
  public static final int SPLIT_DIGITS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- PRETOKENIZATION_DELIMITER_FIELD_NUMBER
  
  public static final int PRETOKENIZATION_DELIMITER_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- CONTROL_SYMBOLS_FIELD_NUMBER
  
  public static final int CONTROL_SYMBOLS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- USER_DEFINED_SYMBOLS_FIELD_NUMBER
  
  public static final int USER_DEFINED_SYMBOLS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- REQUIRED_CHARS_FIELD_NUMBER
  
  public static final int REQUIRED_CHARS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- BYTE_FALLBACK_FIELD_NUMBER
  
  public static final int BYTE_FALLBACK_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER
  
  public static final int VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- HARD_VOCAB_LIMIT_FIELD_NUMBER
  
  public static final int HARD_VOCAB_LIMIT_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- USE_ALL_VOCAB_FIELD_NUMBER
  
  public static final int USE_ALL_VOCAB_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- UNK_ID_FIELD_NUMBER
  
  public static final int UNK_ID_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- BOS_ID_FIELD_NUMBER
  
  public static final int BOS_ID_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- EOS_ID_FIELD_NUMBER
  
  public static final int EOS_ID_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- PAD_ID_FIELD_NUMBER
  
  public static final int PAD_ID_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- UNK_PIECE_FIELD_NUMBER
  
  public static final int UNK_PIECE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- BOS_PIECE_FIELD_NUMBER
  
  public static final int BOS_PIECE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- EOS_PIECE_FIELD_NUMBER
  
  public static final int EOS_PIECE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- PAD_PIECE_FIELD_NUMBER
  
  public static final int PAD_PIECE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- UNK_SURFACE_FIELD_NUMBER
  
  public static final int UNK_SURFACE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER
  
  public static final int TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- SEED_SENTENCEPIECES_FILE_FIELD_NUMBER
  
  public static final int SEED_SENTENCEPIECES_FILE_FIELD_NUMBER
  See Also:
  
  Constant Field Values
- PARSER
  
  @Deprecated public static final com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> PARSER
  
  Deprecated.
Method Details
- getDescriptor
  
  public static final com.google.protobuf.Descriptors.Descriptor getDescriptor()
- getInputList
  
  public com.google.protobuf.ProtocolStringList getInputList()
  General parameters. Input corpus files. The trainer accepts the following two formats: A) Monolingual: plain text, one sentence per line. B) Bilingual: TSV, source sentence <tab> target sentence. When bilingual data is passed, a shared vocabulary model is built. Note that the input file must be a raw corpus, not a preprocessed corpus. The trainer only loads the first `input_sentence_size` sentences specified with this parameter.
  repeated string input = 1;
  Specified by:
  
  getInputList in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  A list containing the input.
- getInputCount
  
  public int getInputCount()
  General parameters. Input corpus files. The trainer accepts the following two formats: A) Monolingual: plain text, one sentence per line. B) Bilingual: TSV, source sentence <tab> target sentence. When bilingual data is passed, a shared vocabulary model is built. Note that the input file must be a raw corpus, not a preprocessed corpus. The trainer only loads the first `input_sentence_size` sentences specified with this parameter.
  repeated string input = 1;
  Specified by:
  
  getInputCount in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The count of input.
- getInput
  
  public String getInput(int index)
  General parameters. Input corpus files. The trainer accepts the following two formats: A) Monolingual: plain text, one sentence per line. B) Bilingual: TSV, source sentence <tab> target sentence. When bilingual data is passed, a shared vocabulary model is built. Note that the input file must be a raw corpus, not a preprocessed corpus. The trainer only loads the first `input_sentence_size` sentences specified with this parameter.
  repeated string input = 1;
  Specified by:
  
  getInput in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the element to return.
  
  Returns:
  
  The input at the given index.
- getInputBytes
  
  public com.google.protobuf.ByteString getInputBytes(int index)
  General parameters. Input corpus files. The trainer accepts the following two formats: A) Monolingual: plain text, one sentence per line. B) Bilingual: TSV, source sentence <tab> target sentence. When bilingual data is passed, a shared vocabulary model is built. Note that the input file must be a raw corpus, not a preprocessed corpus. The trainer only loads the first `input_sentence_size` sentences specified with this parameter.
  repeated string input = 1;
  Specified by:
  
  getInputBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the value to return.
  
  Returns:
  
  The bytes of the input at the given index.
- hasInputFormat
  
  public boolean hasInputFormat()
  Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq
  optional string input_format = 7;
  Specified by:
  
  hasInputFormat in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the inputFormat field is set.
- getInputFormat
  
  public String getInputFormat()
  Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq
  optional string input_format = 7;
  Specified by:
  
  getInputFormat in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The inputFormat.
- getInputFormatBytes
  
  public com.google.protobuf.ByteString getInputFormatBytes()
  Input corpus format: "text": one-sentence-per-line text format (default) "tsv": sentence <tab> freq
  optional string input_format = 7;
  Specified by:
  
  getInputFormatBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for inputFormat.
- hasModelPrefix
  
  public boolean hasModelPrefix()
  Output model file prefix. <model_prefix>.model and <model_prefix>.vocab are generated.
  optional string model_prefix = 2;
  Specified by:
  
  hasModelPrefix in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the modelPrefix field is set.
- getModelPrefix
  
  public String getModelPrefix()
  Output model file prefix. <model_prefix>.model and <model_prefix>.vocab are generated.
  optional string model_prefix = 2;
  Specified by:
  
  getModelPrefix in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The modelPrefix.
- getModelPrefixBytes
  
  public com.google.protobuf.ByteString getModelPrefixBytes()
  Output model file prefix. <model_prefix>.model and <model_prefix>.vocab are generated.
  optional string model_prefix = 2;
  Specified by:
  
  getModelPrefixBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for modelPrefix.
- hasModelType
  
  public boolean hasModelType()
  
  optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
  
  Specified by:
  
  hasModelType in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the modelType field is set.
- getModelType
  
  public SentencepieceModel.TrainerSpec.ModelType getModelType()
  
  optional .com.google.genai.proto.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
  
  Specified by:
  
  getModelType in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The modelType.
- hasVocabSize
  
  public boolean hasVocabSize()
  Vocabulary size. 8k is the default size.
  optional int32 vocab_size = 4 [default = 8000];
  Specified by:
  
  hasVocabSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the vocabSize field is set.
- getVocabSize
  
  public int getVocabSize()
  Vocabulary size. 8k is the default size.
  optional int32 vocab_size = 4 [default = 8000];
  Specified by:
  
  getVocabSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The vocabSize.
- getAcceptLanguageList
  
  public com.google.protobuf.ProtocolStringList getAcceptLanguageList()
  List of the languages this model can accept. Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;
  Specified by:
  
  getAcceptLanguageList in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  A list containing the acceptLanguage.
- getAcceptLanguageCount
  
  public int getAcceptLanguageCount()
  List of the languages this model can accept. Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;
  Specified by:
  
  getAcceptLanguageCount in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The count of acceptLanguage.
- getAcceptLanguage
  
  public String getAcceptLanguage(int index)
  List of the languages this model can accept. Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;
  Specified by:
  
  getAcceptLanguage in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the element to return.
  
  Returns:
  
  The acceptLanguage at the given index.
- getAcceptLanguageBytes
  
  public com.google.protobuf.ByteString getAcceptLanguageBytes(int index)
  List of the languages this model can accept. Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;
  Specified by:
  
  getAcceptLanguageBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the value to return.
  
  Returns:
  
  The bytes of the acceptLanguage at the given index.
- hasSelfTestSampleSize
  
  public boolean hasSelfTestSampleSize()
  Size of self-test samples, which are encoded in the model file.
  optional int32 self_test_sample_size = 6 [default = 0];
  Specified by:
  
  hasSelfTestSampleSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the selfTestSampleSize field is set.
- getSelfTestSampleSize
  
  public int getSelfTestSampleSize()
  Size of self-test samples, which are encoded in the model file.
  optional int32 self_test_sample_size = 6 [default = 0];
  Specified by:
  
  getSelfTestSampleSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The selfTestSampleSize.
- hasEnableDifferentialPrivacy
  
  public boolean hasEnableDifferentialPrivacy()
  Whether to use DP version of sentencepiece. Use it with TSV input format (requires precomputed word tab counts to work).
  optional bool enable_differential_privacy = 50 [default = false];
  Specified by:
  
  hasEnableDifferentialPrivacy in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the enableDifferentialPrivacy field is set.
- getEnableDifferentialPrivacy
  
  public boolean getEnableDifferentialPrivacy()
  Whether to use DP version of sentencepiece. Use it with TSV input format (requires precomputed word tab counts to work).
  optional bool enable_differential_privacy = 50 [default = false];
  Specified by:
  
  getEnableDifferentialPrivacy in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The enableDifferentialPrivacy.
- hasDifferentialPrivacyNoiseLevel
  
  public boolean hasDifferentialPrivacyNoiseLevel()
  Set these parameters if you need the DP version of sentencepiece. Standard deviation of the noise to add.
  optional float differential_privacy_noise_level = 51 [default = 0];
  Specified by:
  
  hasDifferentialPrivacyNoiseLevel in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the differentialPrivacyNoiseLevel field is set.
- getDifferentialPrivacyNoiseLevel
  
  public float getDifferentialPrivacyNoiseLevel()
  Set these parameters if you need the DP version of sentencepiece. Standard deviation of the noise to add.
  optional float differential_privacy_noise_level = 51 [default = 0];
  Specified by:
  
  getDifferentialPrivacyNoiseLevel in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The differentialPrivacyNoiseLevel.
- hasDifferentialPrivacyClippingThreshold
  
  public boolean hasDifferentialPrivacyClippingThreshold()
  Clipping threshold to apply after adding noise. All the words with frequency less than this value are dropped.
  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
  Specified by:
  
  hasDifferentialPrivacyClippingThreshold in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the differentialPrivacyClippingThreshold field is set.
- getDifferentialPrivacyClippingThreshold
  
  public long getDifferentialPrivacyClippingThreshold()
  Clipping threshold to apply after adding noise. All the words with frequency less than this value are dropped.
  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
  Specified by:
  
  getDifferentialPrivacyClippingThreshold in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The differentialPrivacyClippingThreshold.
- hasCharacterCoverage
  
  public boolean hasCharacterCoverage()
  Training parameters. Uses characters which cover the corpus with the ratio of `character_coverage`. This parameter determines the basic alphabet of the sentence pieces. The remaining (1.0 - `character_coverage`) fraction of characters is treated as UNK. See also the required_chars field.
  optional float character_coverage = 10 [default = 0.9995];
  Specified by:
  
  hasCharacterCoverage in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the characterCoverage field is set.
- getCharacterCoverage
  
  public float getCharacterCoverage()
  Training parameters. Uses characters which cover the corpus with the ratio of `character_coverage`. This parameter determines the basic alphabet of the sentence pieces. The remaining (1.0 - `character_coverage`) fraction of characters is treated as UNK. See also the required_chars field.
  optional float character_coverage = 10 [default = 0.9995];
  Specified by:
  
  getCharacterCoverage in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The characterCoverage.
- hasInputSentenceSize
  
  public boolean hasInputSentenceSize()
  Maximum size of sentences the trainer loads from `input` parameter. Trainer simply loads the `input` files in sequence. It is better to shuffle the input corpus randomly.
  optional uint64 input_sentence_size = 11 [default = 0];
  Specified by:
  
  hasInputSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the inputSentenceSize field is set.
- getInputSentenceSize
  
  public long getInputSentenceSize()
  Maximum size of sentences the trainer loads from `input` parameter. Trainer simply loads the `input` files in sequence. It is better to shuffle the input corpus randomly.
  optional uint64 input_sentence_size = 11 [default = 0];
  Specified by:
  
  getInputSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The inputSentenceSize.
- hasShuffleInputSentence
  
  public boolean hasShuffleInputSentence()
  
  optional bool shuffle_input_sentence = 19 [default = true];
  
  Specified by:
  
  hasShuffleInputSentence in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the shuffleInputSentence field is set.
- getShuffleInputSentence
  
  public boolean getShuffleInputSentence()
  
  optional bool shuffle_input_sentence = 19 [default = true];
  
  Specified by:
  
  getShuffleInputSentence in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The shuffleInputSentence.
- hasMiningSentenceSize
  
  @Deprecated public boolean hasMiningSentenceSize()
  
  Deprecated.
  com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated. See sentencepiece_model.proto;l=96
  Maximum size of sentences to make seed sentence pieces. Extended suffix array is constructed to extract frequent sub-strings from the corpus. This uses 20N working space, where N is the size of corpus.
  optional int32 mining_sentence_size = 12 [deprecated = true];
  Specified by:
  
  hasMiningSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the miningSentenceSize field is set.
- getMiningSentenceSize
  
  @Deprecated public int getMiningSentenceSize()
  
  Deprecated.
  com.google.genai.proto.TrainerSpec.mining_sentence_size is deprecated. See sentencepiece_model.proto;l=96
  Maximum size of sentences to make seed sentence pieces. Extended suffix array is constructed to extract frequent sub-strings from the corpus. This uses 20N working space, where N is the size of corpus.
  optional int32 mining_sentence_size = 12 [deprecated = true];
  Specified by:
  
  getMiningSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The miningSentenceSize.
- hasTrainingSentenceSize
  
  @Deprecated public boolean hasTrainingSentenceSize()
  
  Deprecated.
  com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated. See sentencepiece_model.proto;l=99
  Maximum size of sentences to train sentence pieces.
  optional int32 training_sentence_size = 13 [deprecated = true];
  Specified by:
  
  hasTrainingSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the trainingSentenceSize field is set.
- getTrainingSentenceSize
  
  @Deprecated public int getTrainingSentenceSize()
  
  Deprecated.
  com.google.genai.proto.TrainerSpec.training_sentence_size is deprecated. See sentencepiece_model.proto;l=99
  Maximum size of sentences to train sentence pieces.
  optional int32 training_sentence_size = 13 [deprecated = true];
  Specified by:
  
  getTrainingSentenceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The trainingSentenceSize.
- hasSeedSentencepieceSize
  
  public boolean hasSeedSentencepieceSize()
  The size of seed sentencepieces. `seed_sentencepiece_size` must be larger than `vocab_size`.
  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
  Specified by:
  
  hasSeedSentencepieceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the seedSentencepieceSize field is set.
- getSeedSentencepieceSize
  
  public int getSeedSentencepieceSize()
  The size of seed sentencepieces. `seed_sentencepiece_size` must be larger than `vocab_size`.
  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
  Specified by:
  
  getSeedSentencepieceSize in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The seedSentencepieceSize.
- hasShrinkingFactor
  
  public boolean hasShrinkingFactor()
  In every EM sub-iteration, keeps top `shrinking_factor` * `current sentencepieces size` with respect to the loss of the sentence piece. This value should be smaller than 1.0.
  optional float shrinking_factor = 15 [default = 0.75];
  Specified by:
  
  hasShrinkingFactor in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the shrinkingFactor field is set.
- getShrinkingFactor
  
  public float getShrinkingFactor()
  In every EM sub-iteration, keeps top `shrinking_factor` * `current sentencepieces size` with respect to the loss of the sentence piece. This value should be smaller than 1.0.
  optional float shrinking_factor = 15 [default = 0.75];
  Specified by:
  
  getShrinkingFactor in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The shrinkingFactor.
- hasMaxSentenceLength
  
  public boolean hasMaxSentenceLength()
  The maximum sentence length in bytes. Sentences longer than `max_sentence_length` are simply ignored. Longer input tends to bring the following risks: * Overflow during EM training (unigram language model only) * Performance drop because of O(n log n) cost in BPE.
  optional int32 max_sentence_length = 18 [default = 4192];
  Specified by:
  
  hasMaxSentenceLength in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the maxSentenceLength field is set.
- getMaxSentenceLength
  
  public int getMaxSentenceLength()
  The maximum sentence length in bytes. Sentences longer than `max_sentence_length` are simply ignored. Longer input tends to bring the following risks: * Overflow during EM training (unigram language model only) * Performance drop because of O(n log n) cost in BPE.
  optional int32 max_sentence_length = 18 [default = 4192];
  Specified by:
  
  getMaxSentenceLength in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The maxSentenceLength.
- hasNumThreads
  
  public boolean hasNumThreads()
  Number of threads in the training.
  optional int32 num_threads = 16 [default = 16];
  Specified by:
  
  hasNumThreads in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the numThreads field is set.
- getNumThreads
  
  public int getNumThreads()
  Number of threads in the training.
  optional int32 num_threads = 16 [default = 16];
  Specified by:
  
  getNumThreads in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The numThreads.
- hasNumSubIterations
  
  public boolean hasNumSubIterations()
  Number of EM sub iterations.
  optional int32 num_sub_iterations = 17 [default = 2];
  Specified by:
  
  hasNumSubIterations in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the numSubIterations field is set.
- getNumSubIterations
  
  public int getNumSubIterations()
  Number of EM sub iterations.
  optional int32 num_sub_iterations = 17 [default = 2];
  Specified by:
  
  getNumSubIterations in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The numSubIterations.
- hasMaxSentencepieceLength
  
  public boolean hasMaxSentencepieceLength()
  SentencePiece parameters which control the shapes of sentence pieces. Maximum length of a sentencepiece.
  optional int32 max_sentencepiece_length = 20 [default = 16];
  Specified by:
  
  hasMaxSentencepieceLength in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the maxSentencepieceLength field is set.
- getMaxSentencepieceLength
  
  public int getMaxSentencepieceLength()
  SentencePiece parameters which control the shapes of sentence pieces. Maximum length of a sentencepiece.
  optional int32 max_sentencepiece_length = 20 [default = 16];
  Specified by:
  
  getMaxSentencepieceLength in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The maxSentencepieceLength.
- hasSplitByUnicodeScript
  
  public boolean hasSplitByUnicodeScript()
  Uses Unicode script to split sentence pieces. When `split_by_unicode_script` is true, we do not allow a sentence piece to include multiple Unicode scripts, e.g. "F1" is not a valid piece. Exception: CJ characters (Hiragana/Katakana/Han) are all handled as one script type, since a Japanese word can consist of multiple scripts. This exception is always applied regardless of the accept-language parameter.
  optional bool split_by_unicode_script = 21 [default = true];
  Specified by:
  
  hasSplitByUnicodeScript in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the splitByUnicodeScript field is set.
- getSplitByUnicodeScript
  
  public boolean getSplitByUnicodeScript()
  Uses Unicode script to split sentence pieces. When `split_by_unicode_script` is true, we do not allow a sentence piece to include multiple Unicode scripts, e.g. "F1" is not a valid piece. Exception: CJ characters (Hiragana/Katakana/Han) are all handled as one script type, since a Japanese word can consist of multiple scripts. This exception is always applied regardless of the accept-language parameter.
  optional bool split_by_unicode_script = 21 [default = true];
  Specified by:
  
  getSplitByUnicodeScript in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The splitByUnicodeScript.
- hasSplitByNumber
  
  public boolean hasSplitByNumber()
  When `split_by_number` is true, put a boundary at each transition between number and non-number. If we want to treat "F1" as one token, set this flag to false.
  optional bool split_by_number = 23 [default = true];
  Specified by:
  
  hasSplitByNumber in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the splitByNumber field is set.
- getSplitByNumber
  
  public boolean getSplitByNumber()
  When `split_by_number` is true, put a boundary at each transition between number and non-number. If we want to treat "F1" as one token, set this flag to false.
  optional bool split_by_number = 23 [default = true];
  Specified by:
  
  getSplitByNumber in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The splitByNumber.
- hasSplitByWhitespace
  
  public boolean hasSplitByWhitespace()
  Use a white space to split sentence pieces. When `split_by_whitespace` is false, we may have the piece containing a white space in the middle. e.g., "in_the".
  optional bool split_by_whitespace = 22 [default = true];
  Specified by:
  
  hasSplitByWhitespace in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the splitByWhitespace field is set.
- getSplitByWhitespace
  
  public boolean getSplitByWhitespace()
  Use a white space to split sentence pieces. When `split_by_whitespace` is false, we may have the piece containing a white space in the middle. e.g., "in_the".
  optional bool split_by_whitespace = 22 [default = true];
  Specified by:
  
  getSplitByWhitespace in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The splitByWhitespace.
- hasTreatWhitespaceAsSuffix
  
  public boolean hasTreatWhitespaceAsSuffix()
  Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => hello_. When `treat_whitespace_as_suffix` is true, NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end of sentence.
  optional bool treat_whitespace_as_suffix = 24 [default = false];
  Specified by:
  
  hasTreatWhitespaceAsSuffix in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the treatWhitespaceAsSuffix field is set.
- getTreatWhitespaceAsSuffix
  
  public boolean getTreatWhitespaceAsSuffix()
  Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => hello_. When `treat_whitespace_as_suffix` is true, NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end of sentence.
  optional bool treat_whitespace_as_suffix = 24 [default = false];
  Specified by:
  
  getTreatWhitespaceAsSuffix in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The treatWhitespaceAsSuffix.
- hasAllowWhitespaceOnlyPieces
  
  public boolean hasAllowWhitespaceOnlyPieces()
  Allows pieces that only contain whitespaces instead of appearing only as prefix or suffix of other pieces.
  optional bool allow_whitespace_only_pieces = 26 [default = false];
  Specified by:
  
  hasAllowWhitespaceOnlyPieces in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the allowWhitespaceOnlyPieces field is set.
- getAllowWhitespaceOnlyPieces
  
  public boolean getAllowWhitespaceOnlyPieces()
  Allows pieces that only contain whitespaces instead of appearing only as prefix or suffix of other pieces.
  optional bool allow_whitespace_only_pieces = 26 [default = false];
  Specified by:
  
  getAllowWhitespaceOnlyPieces in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The allowWhitespaceOnlyPieces.
- hasSplitDigits
  
  public boolean hasSplitDigits()
  Split all digits (0-9) into separate pieces.
  optional bool split_digits = 25 [default = false];
  Specified by:
  
  hasSplitDigits in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the splitDigits field is set.
- getSplitDigits
  
  public boolean getSplitDigits()
  Split all digits (0-9) into separate pieces.
  optional bool split_digits = 25 [default = false];
  Specified by:
  
  getSplitDigits in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The splitDigits.
- hasPretokenizationDelimiter
  
  public boolean hasPretokenizationDelimiter()
  Defines the pre-tokenization delimiter. When specified, pieces crossing this delimiter are not included in the vocab. The delimiter string is then virtually ignored during training. This field allows constraints on the vocabulary selection. Note that this field is available in unigram mode.
  optional string pretokenization_delimiter = 53 [default = ""];
  Specified by:
  
  hasPretokenizationDelimiter in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the pretokenizationDelimiter field is set.
- getPretokenizationDelimiter
  
  public String getPretokenizationDelimiter()
  Defines the pre-tokenization delimiter. When specified, pieces crossing this delimiter are not included in the vocab. The delimiter string is then virtually ignored during training. This field allows constraints on the vocabulary selection. Note that this field is available in unigram mode.
  optional string pretokenization_delimiter = 53 [default = ""];
  Specified by:
  
  getPretokenizationDelimiter in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The pretokenizationDelimiter.
- getPretokenizationDelimiterBytes
  
  public com.google.protobuf.ByteString getPretokenizationDelimiterBytes()
  Defines the pre-tokenization delimiter. When specified, pieces crossing this delimiter are not included in the vocab. The delimiter string is then virtually ignored during training. This field allows constraints on the vocabulary selection. Note that this field is available in unigram mode.
  optional string pretokenization_delimiter = 53 [default = ""];
  Specified by:
  
  getPretokenizationDelimiterBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for pretokenizationDelimiter.
- getControlSymbolsList
  
  public com.google.protobuf.ProtocolStringList getControlSymbolsList()
  Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder. <s> and </s> are pre-defined. We can use this field to encode various meta information, including a language indicator in a multilingual model. These symbols are not visible to users, but visible to the decoder. Note that when the input sentence contains control symbols, they are not treated as one token, but segmented into normal pieces. Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;
  Specified by:
  
  getControlSymbolsList in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  A list containing the controlSymbols.
- getControlSymbolsCount
  
  public int getControlSymbolsCount()
  Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder. <s> and </s> are pre-defined. We can use this field to encode various meta information, including a language indicator in a multilingual model. These symbols are not visible to users, but visible to the decoder. Note that when the input sentence contains control symbols, they are not treated as one token, but segmented into normal pieces. Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;
  Specified by:
  
  getControlSymbolsCount in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The count of controlSymbols.
- getControlSymbols
  
  public String getControlSymbols(int index)
  Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder. <s> and </s> are pre-defined. We can use this field to encode various meta information, including a language indicator in a multilingual model. These symbols are not visible to users, but visible to the decoder. Note that when the input sentence contains control symbols, they are not treated as one token, but segmented into normal pieces. Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;
  Specified by:
  
  getControlSymbols in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the element to return.
  
  Returns:
  
  The controlSymbols at the given index.
- getControlSymbolsBytes
  
  public com.google.protobuf.ByteString getControlSymbolsBytes(int index)
  Vocabulary management. Defines control symbols used as an indicator to change the behavior of the decoder. <s> and </s> are pre-defined. We can use this field to encode various meta information, including a language indicator in a multilingual model. These symbols are not visible to users, but visible to the decoder. Note that when the input sentence contains control symbols, they are not treated as one token, but segmented into normal pieces. Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;
  Specified by:
  
  getControlSymbolsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the value to return.
  
  Returns:
  
  The bytes of the controlSymbols at the given index.
- getUserDefinedSymbolsList
  
  public com.google.protobuf.ProtocolStringList getUserDefinedSymbolsList()
  Defines user defined symbols. These symbols are added with extremely high score so they are always treated as one unique symbol in any context. Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;
  Specified by:
  
  getUserDefinedSymbolsList in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  A list containing the userDefinedSymbols.
- getUserDefinedSymbolsCount
  
  public int getUserDefinedSymbolsCount()
  Defines user defined symbols. These symbols are added with extremely high score so they are always treated as one unique symbol in any context. Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;
  Specified by:
  
  getUserDefinedSymbolsCount in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The count of userDefinedSymbols.
- getUserDefinedSymbols
  
  public String getUserDefinedSymbols(int index)
  Defines user defined symbols. These symbols are added with extremely high score so they are always treated as one unique symbol in any context. Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;
  Specified by:
  
  getUserDefinedSymbols in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the element to return.
  
  Returns:
  
  The userDefinedSymbols at the given index.
- getUserDefinedSymbolsBytes
  
  public com.google.protobuf.ByteString getUserDefinedSymbolsBytes(int index)
  Defines user defined symbols. These symbols are added with extremely high score so they are always treated as one unique symbol in any context. Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;
  Specified by:
  
  getUserDefinedSymbolsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Parameters:
  
  index - The index of the value to return.
  
  Returns:
  
  The bytes of the userDefinedSymbols at the given index.
- hasRequiredChars
  
  public boolean hasRequiredChars()
  Defines required characters. Each UTF8 character in this string is included in the character set regardless of character_coverage value. Unlike user_defined_symbols, these characters have scores based on the frequency on input sentences, and the model can form subwords using characters in this field.
  optional string required_chars = 36;
  Specified by:
  
  hasRequiredChars in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the requiredChars field is set.
- getRequiredChars
  
  public String getRequiredChars()
  Defines required characters. Each UTF8 character in this string is included in the character set regardless of character_coverage value. Unlike user_defined_symbols, these characters have scores based on the frequency on input sentences, and the model can form subwords using characters in this field.
  optional string required_chars = 36;
  Specified by:
  
  getRequiredChars in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The requiredChars.
- getRequiredCharsBytes
  
  public com.google.protobuf.ByteString getRequiredCharsBytes()
  Defines required characters. Each UTF8 character in this string is included in the character set regardless of character_coverage value. Unlike user_defined_symbols, these characters have scores based on the frequency on input sentences, and the model can form subwords using characters in this field.
  optional string required_chars = 36;
  Specified by:
  
  getRequiredCharsBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for requiredChars.
- hasByteFallback
  
  public boolean hasByteFallback()
  Decomposes unknown pieces into UTF-8 bytes.
  optional bool byte_fallback = 35 [default = false];
  Specified by:
  
  hasByteFallback in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the byteFallback field is set.
- getByteFallback
  
  public boolean getByteFallback()
  Decomposes unknown pieces into UTF-8 bytes.
  optional bool byte_fallback = 35 [default = false];
  Specified by:
  
  getByteFallback in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The byteFallback.
- hasVocabularyOutputPieceScore
  
  public boolean hasVocabularyOutputPieceScore()
  When creating the vocabulary file, defines whether or not to additionally output the score for each piece.
  optional bool vocabulary_output_piece_score = 32 [default = true];
  Specified by:
  
  hasVocabularyOutputPieceScore in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the vocabularyOutputPieceScore field is set.
- getVocabularyOutputPieceScore
  
  public boolean getVocabularyOutputPieceScore()
  When creating the vocabulary file, defines whether or not to additionally output the score for each piece.
  optional bool vocabulary_output_piece_score = 32 [default = true];
  Specified by:
  
  getVocabularyOutputPieceScore in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The vocabularyOutputPieceScore.
- hasHardVocabLimit
  
  public boolean hasHardVocabLimit()
  `vocab_size` is treated as a hard limit. Crash if the model cannot produce a vocab of size `vocab_size`. When `hard_vocab_limit` is false, vocab_size is treated as a soft limit. Note that when model_type=char, always assumes hard_vocab_limit = false.
  optional bool hard_vocab_limit = 33 [default = true];
  Specified by:
  
  hasHardVocabLimit in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the hardVocabLimit field is set.
- getHardVocabLimit
  
  public boolean getHardVocabLimit()
  `vocab_size` is treated as a hard limit. Crash if the model cannot produce a vocab of size `vocab_size`. When `hard_vocab_limit` is false, vocab_size is treated as a soft limit. Note that when model_type=char, always assumes hard_vocab_limit = false.
  optional bool hard_vocab_limit = 33 [default = true];
  Specified by:
  
  getHardVocabLimit in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The hardVocabLimit.
- hasUseAllVocab
  
  public boolean hasUseAllVocab()
  Use all symbols for vocab extraction. This flag is valid if the model type is either CHAR or WORD.
  optional bool use_all_vocab = 34 [default = false];
  Specified by:
  
  hasUseAllVocab in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the useAllVocab field is set.
- getUseAllVocab
  
  public boolean getUseAllVocab()
  Use all symbols for vocab extraction. This flag is valid if the model type is either CHAR or WORD.
  optional bool use_all_vocab = 34 [default = false];
  Specified by:
  
  getUseAllVocab in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The useAllVocab.
- hasUnkId
  
  public boolean hasUnkId()
  Reserved special meta tokens. An id of -1 means the token is not used; unk_id must not be -1. Ids must start with 0 and be contiguous.
  optional int32 unk_id = 40 [default = 0];
  Specified by:
  
  hasUnkId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the unkId field is set.
- getUnkId
  
  public int getUnkId()
  Reserved special meta tokens. An id of -1 means the token is not used; unk_id must not be -1. Ids must start with 0 and be contiguous.
  optional int32 unk_id = 40 [default = 0];
  Specified by:
  
  getUnkId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The unkId.
- hasBosId
  
  public boolean hasBosId()
  <s>
  optional int32 bos_id = 41 [default = 1];
  Specified by:
  
  hasBosId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the bosId field is set.
- getBosId
  
  public int getBosId()
  <s>
  optional int32 bos_id = 41 [default = 1];
  Specified by:
  
  getBosId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bosId.
- hasEosId
  
  public boolean hasEosId()
  </s>
  optional int32 eos_id = 42 [default = 2];
  Specified by:
  
  hasEosId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the eosId field is set.
- getEosId
  
  public int getEosId()
  </s>
  optional int32 eos_id = 42 [default = 2];
  Specified by:
  
  getEosId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The eosId.
- hasPadId
  
  public boolean hasPadId()
  <pad> (padding)
  optional int32 pad_id = 43 [default = -1];
  Specified by:
  
  hasPadId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the padId field is set.
- getPadId
  
  public int getPadId()
  <pad> (padding)
  optional int32 pad_id = 43 [default = -1];
  Specified by:
  
  getPadId in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The padId.
- hasUnkPiece
  
  public boolean hasUnkPiece()
  
  optional string unk_piece = 45 [default = "<unk>"];
  
  Specified by:
  
  hasUnkPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the unkPiece field is set.
- getUnkPiece
  
  public String getUnkPiece()
  
  optional string unk_piece = 45 [default = "<unk>"];
  
  Specified by:
  
  getUnkPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The unkPiece.
- getUnkPieceBytes
  
  public com.google.protobuf.ByteString getUnkPieceBytes()
  
  optional string unk_piece = 45 [default = "<unk>"];
  
  Specified by:
  
  getUnkPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for unkPiece.
- hasBosPiece
  
  public boolean hasBosPiece()
  
  optional string bos_piece = 46 [default = "<s>"];
  
  Specified by:
  
  hasBosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the bosPiece field is set.
- getBosPiece
  
  public String getBosPiece()
  
  optional string bos_piece = 46 [default = "<s>"];
  
  Specified by:
  
  getBosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bosPiece.
- getBosPieceBytes
  
  public com.google.protobuf.ByteString getBosPieceBytes()
  
  optional string bos_piece = 46 [default = "<s>"];
  
  Specified by:
  
  getBosPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for bosPiece.
- hasEosPiece
  
  public boolean hasEosPiece()
  
  optional string eos_piece = 47 [default = "</s>"];
  
  Specified by:
  
  hasEosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the eosPiece field is set.
- getEosPiece
  
  public String getEosPiece()
  
  optional string eos_piece = 47 [default = "</s>"];
  
  Specified by:
  
  getEosPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The eosPiece.
- getEosPieceBytes
  
  public com.google.protobuf.ByteString getEosPieceBytes()
  
  optional string eos_piece = 47 [default = "</s>"];
  
  Specified by:
  
  getEosPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for eosPiece.
- hasPadPiece
  
  public boolean hasPadPiece()
  
  optional string pad_piece = 48 [default = "<pad>"];
  
  Specified by:
  
  hasPadPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the padPiece field is set.
- getPadPiece
  
  public String getPadPiece()
  
  optional string pad_piece = 48 [default = "<pad>"];
  
  Specified by:
  
  getPadPiece in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The padPiece.
- getPadPieceBytes
  
  public com.google.protobuf.ByteString getPadPieceBytes()
  
  optional string pad_piece = 48 [default = "<pad>"];
  
  Specified by:
  
  getPadPieceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for padPiece.
- hasUnkSurface
  
  public boolean hasUnkSurface()
  Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful for both users and developers: it makes it easy to figure out that <unk> was emitted.
  optional string unk_surface = 44 [default = " \342\201\207 "];
  Specified by:
  
  hasUnkSurface in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the unkSurface field is set.
- getUnkSurface
  
  public String getUnkSurface()
  Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful for both users and developers: it makes it easy to figure out that <unk> was emitted.
  optional string unk_surface = 44 [default = " \342\201\207 "];
  Specified by:
  
  getUnkSurface in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The unkSurface.
- getUnkSurfaceBytes
  
  public com.google.protobuf.ByteString getUnkSurfaceBytes()
  Encodes <unk> into U+2047 (DOUBLE QUESTION MARK), since this character can be useful for both users and developers: it makes it easy to figure out that <unk> was emitted.
  optional string unk_surface = 44 [default = " \342\201\207 "];
  Specified by:
  
  getUnkSurfaceBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for unkSurface.
- hasTrainExtremelyLargeCorpus
  
  public boolean hasTrainExtremelyLargeCorpus()
  Increase bit depth to allow unigram model training on large (>10M sentences) corpora. A side effect of enabling this flag is increased memory usage.
  optional bool train_extremely_large_corpus = 49 [default = false];
  Specified by:
  
  hasTrainExtremelyLargeCorpus in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the trainExtremelyLargeCorpus field is set.
- getTrainExtremelyLargeCorpus
  
  public boolean getTrainExtremelyLargeCorpus()
  Increase bit depth to allow unigram model training on large (>10M sentences) corpora. A side effect of enabling this flag is increased memory usage.
  optional bool train_extremely_large_corpus = 49 [default = false];
  Specified by:
  
  getTrainExtremelyLargeCorpus in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The trainExtremelyLargeCorpus.
- hasSeedSentencepiecesFile
  
  public boolean hasSeedSentencepiecesFile()
  Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.
  optional string seed_sentencepieces_file = 54 [default = ""];
  Specified by:
  
  hasSeedSentencepiecesFile in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  Whether the seedSentencepiecesFile field is set.
- getSeedSentencepiecesFile
  
  public String getSeedSentencepiecesFile()
  Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.
  optional string seed_sentencepieces_file = 54 [default = ""];
  Specified by:
  
  getSeedSentencepiecesFile in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The seedSentencepiecesFile.
- getSeedSentencepiecesFileBytes
  
  public com.google.protobuf.ByteString getSeedSentencepiecesFileBytes()
  Path to a seed sentencepieces file, with one tab-separated seed sentencepiece <tab> frequency per line.
  optional string seed_sentencepieces_file = 54 [default = ""];
  Specified by:
  
  getSeedSentencepiecesFileBytes in interface SentencepieceModel.TrainerSpecOrBuilder
  
  Returns:
  
  The bytes for seedSentencepiecesFile.
- isInitialized
  
  public final boolean isInitialized()
  
  Specified by:
  
  isInitialized in interface com.google.protobuf.MessageLiteOrBuilder
  
  Overrides:
  
  isInitialized in class com.google.protobuf.GeneratedMessageV3.ExtendableMessage<SentencepieceModel.TrainerSpec>
- writeTo
  
  public void writeTo(com.google.protobuf.CodedOutputStream output) throws IOException
  
  Specified by:
  
  writeTo in interface com.google.protobuf.MessageLite
  
  Overrides:
  
  writeTo in class com.google.protobuf.GeneratedMessageV3
  
  Throws:
  
  IOException
- getSerializedSize
  
  public int getSerializedSize()
  
  Specified by:
  
  getSerializedSize in interface com.google.protobuf.MessageLite
  
  Overrides:
  
  getSerializedSize in class com.google.protobuf.GeneratedMessageV3
- equals
  
  public boolean equals(Object obj)
  
  Specified by:
  
  equals in interface com.google.protobuf.Message
  
  Overrides:
  
  equals in class com.google.protobuf.AbstractMessage
- hashCode
  
  public int hashCode()
  
  Specified by:
  
  hashCode in interface com.google.protobuf.Message
  
  Overrides:
  
  hashCode in class com.google.protobuf.AbstractMessage
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(ByteBuffer data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException
  
  Throws:
  
  com.google.protobuf.InvalidProtocolBufferException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(InputStream input) throws IOException
  
  Throws:
  
  IOException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
  
  Throws:
  
  IOException
- parseDelimitedFrom
  
  public static SentencepieceModel.TrainerSpec parseDelimitedFrom(InputStream input) throws IOException
  
  Throws:
  
  IOException
- parseDelimitedFrom
  
  public static SentencepieceModel.TrainerSpec parseDelimitedFrom(InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
  
  Throws:
  
  IOException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.CodedInputStream input) throws IOException
  
  Throws:
  
  IOException
- parseFrom
  
  public static SentencepieceModel.TrainerSpec parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws IOException
  
  Throws:
  
  IOException
- newBuilderForType
  
  public SentencepieceModel.TrainerSpec.Builder newBuilderForType()
  
  Specified by:
  
  newBuilderForType in interface com.google.protobuf.Message
  
  Specified by:
  
  newBuilderForType in interface com.google.protobuf.MessageLite
- newBuilder
  
  public static SentencepieceModel.TrainerSpec.Builder newBuilder()
- newBuilder
  
  public static SentencepieceModel.TrainerSpec.Builder newBuilder(SentencepieceModel.TrainerSpec prototype)
- toBuilder
  
  public SentencepieceModel.TrainerSpec.Builder toBuilder()
  
  Specified by:
  
  toBuilder in interface com.google.protobuf.Message
  
  Specified by:
  
  toBuilder in interface com.google.protobuf.MessageLite
- getDefaultInstance
  
  public static SentencepieceModel.TrainerSpec getDefaultInstance()
- parser
  
  public static com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> parser()
- getParserForType
  
  public com.google.protobuf.Parser<SentencepieceModel.TrainerSpec> getParserForType()
  
  Specified by:
  
  getParserForType in interface com.google.protobuf.Message
  
  Specified by:
  
  getParserForType in interface com.google.protobuf.MessageLite
  
  Overrides:
  
  getParserForType in class com.google.protobuf.GeneratedMessageV3
- getDefaultInstanceForType
  
  public SentencepieceModel.TrainerSpec getDefaultInstanceForType()
  
  Specified by:
  
  getDefaultInstanceForType in interface com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder<SentencepieceModel.TrainerSpec>
  
  Specified by:
  
  getDefaultInstanceForType in interface com.google.protobuf.MessageLiteOrBuilder
  
  Specified by:
  
  getDefaultInstanceForType in interface com.google.protobuf.MessageOrBuilder

Class SentencepieceModel.TrainerSpec

Nested Class Summary

Nested classes/interfaces inherited from class com.google.protobuf.GeneratedMessageV3

Field Summary

Method Summary

Methods inherited from class com.google.protobuf.GeneratedMessageV3.ExtendableMessage

Methods inherited from class com.google.protobuf.GeneratedMessageV3

Methods inherited from class com.google.protobuf.AbstractMessage

Methods inherited from class com.google.protobuf.AbstractMessageLite

Methods inherited from class java.lang.Object

Methods inherited from interface com.google.protobuf.GeneratedMessageV3.ExtendableMessageOrBuilder

Methods inherited from interface com.google.protobuf.MessageLite

Methods inherited from interface com.google.protobuf.MessageOrBuilder

Field Details

INPUT_FIELD_NUMBER

INPUT_FORMAT_FIELD_NUMBER

MODEL_PREFIX_FIELD_NUMBER

MODEL_TYPE_FIELD_NUMBER

VOCAB_SIZE_FIELD_NUMBER

ACCEPT_LANGUAGE_FIELD_NUMBER

SELF_TEST_SAMPLE_SIZE_FIELD_NUMBER

ENABLE_DIFFERENTIAL_PRIVACY_FIELD_NUMBER

DIFFERENTIAL_PRIVACY_NOISE_LEVEL_FIELD_NUMBER

DIFFERENTIAL_PRIVACY_CLIPPING_THRESHOLD_FIELD_NUMBER

CHARACTER_COVERAGE_FIELD_NUMBER

INPUT_SENTENCE_SIZE_FIELD_NUMBER

SHUFFLE_INPUT_SENTENCE_FIELD_NUMBER

MINING_SENTENCE_SIZE_FIELD_NUMBER

TRAINING_SENTENCE_SIZE_FIELD_NUMBER

SEED_SENTENCEPIECE_SIZE_FIELD_NUMBER

SHRINKING_FACTOR_FIELD_NUMBER

MAX_SENTENCE_LENGTH_FIELD_NUMBER

NUM_THREADS_FIELD_NUMBER

NUM_SUB_ITERATIONS_FIELD_NUMBER

MAX_SENTENCEPIECE_LENGTH_FIELD_NUMBER

SPLIT_BY_UNICODE_SCRIPT_FIELD_NUMBER

SPLIT_BY_NUMBER_FIELD_NUMBER

SPLIT_BY_WHITESPACE_FIELD_NUMBER

TREAT_WHITESPACE_AS_SUFFIX_FIELD_NUMBER

ALLOW_WHITESPACE_ONLY_PIECES_FIELD_NUMBER

SPLIT_DIGITS_FIELD_NUMBER

PRETOKENIZATION_DELIMITER_FIELD_NUMBER

CONTROL_SYMBOLS_FIELD_NUMBER

USER_DEFINED_SYMBOLS_FIELD_NUMBER

REQUIRED_CHARS_FIELD_NUMBER

BYTE_FALLBACK_FIELD_NUMBER

VOCABULARY_OUTPUT_PIECE_SCORE_FIELD_NUMBER

HARD_VOCAB_LIMIT_FIELD_NUMBER

USE_ALL_VOCAB_FIELD_NUMBER

UNK_ID_FIELD_NUMBER

BOS_ID_FIELD_NUMBER

EOS_ID_FIELD_NUMBER

PAD_ID_FIELD_NUMBER

UNK_PIECE_FIELD_NUMBER

BOS_PIECE_FIELD_NUMBER

EOS_PIECE_FIELD_NUMBER

PAD_PIECE_FIELD_NUMBER

UNK_SURFACE_FIELD_NUMBER

TRAIN_EXTREMELY_LARGE_CORPUS_FIELD_NUMBER

SEED_SENTENCEPIECES_FILE_FIELD_NUMBER

PARSER

Method Details

getDescriptor

getInputList

getInputCount

getInput

getInputBytes

hasInputFormat

getInputFormat

getInputFormatBytes

hasModelPrefix

getModelPrefix

getModelPrefixBytes

hasModelType

getModelType

hasVocabSize

getVocabSize

getAcceptLanguageList

getAcceptLanguageCount

getAcceptLanguage