-
Notifications
You must be signed in to change notification settings - Fork 118
Expand file tree
/
Copy pathtokenizers_c.h
More file actions
62 lines (45 loc) · 2.18 KB
/
tokenizers_c.h
File metadata and controls
62 lines (45 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*!
* Copyright (c) 2023 by Contributors
* \file tokenizers_c.h
* \brief C bindings to the Rust tokenizers library
*/
#ifndef TOKENIZERS_C_H_
#define TOKENIZERS_C_H_
// Standard headers are included before the linkage block: the C and C++
// standards both require a header to be included outside of any external
// declaration, and an `extern "C" { ... }` block is such a declaration.
#include <stddef.h>  // size_t
#include <stdint.h>  // int32_t, uint32_t

// The C API
#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief Untyped (void*) handle that the functions in this header return or
 *        take as their first argument to identify a tokenizer.
 *
 * The pointed-to object is not described by this header; callers should treat
 * the value as opaque.
 */
typedef void* TokenizerHandle;
/*!
 * \brief Output record filled in by the tokenizers_encode* functions.
 *
 * tokenizers_free_encode_results() accepts an array of these records.
 * NOTE(review): ownership of token_ids is not visible in this header;
 * presumably the library allocates it and the caller must release it through
 * tokenizers_free_encode_results() - confirm against the implementation.
 */
typedef struct {
// Pointer to the first token id of the result.
int* token_ids;
// Presumably the number of entries in token_ids - TODO confirm.
size_t len;
} TokenizerEncodeResult;
/*!
 * \brief Returns a tokenizer handle built from a JSON string.
 * \param json Pointer to the JSON text.
 * \param len Presumably the length of json in bytes - TODO confirm.
 * \return A TokenizerHandle. Presumably it must be released with
 *         tokenizers_free(); the value returned on malformed input is not
 *         visible in this header - confirm against the implementation.
 */
TokenizerHandle tokenizers_new_from_str(const char* json, size_t len);
/*!
 * \brief Returns a tokenizer handle built from three in-memory buffers:
 *        a vocabulary, a merges list, and added tokens.
 *
 * Each buffer is passed as a pointer plus an explicit length, so it does not
 * have to be null-terminated as far as this signature shows.
 * NOTE(review): the expected format of each buffer is not visible in this
 * header - confirm against the implementation.
 *
 * \param vocab Pointer to the vocabulary data.
 * \param vocab_len Length paired with vocab (presumably in bytes).
 * \param merges Pointer to the merges data.
 * \param merges_len Length paired with merges (presumably in bytes).
 * \param added_tokens Pointer to the added-tokens data.
 * \param added_tokens_len Length paired with added_tokens (presumably in bytes).
 * \return A TokenizerHandle; presumably released with tokenizers_free().
 */
TokenizerHandle byte_level_bpe_tokenizers_new_from_str(const char* vocab, size_t vocab_len,
const char* merges, size_t merges_len,
const char* added_tokens,
size_t added_tokens_len);
/*!
 * \brief Encodes one piece of text and reports the outcome through result.
 * \param handle Tokenizer to use.
 * \param data Pointer to the input text.
 * \param len Length paired with data (presumably in bytes).
 * \param add_special_token Integer flag; presumably non-zero asks the
 *        tokenizer to add special tokens - TODO confirm.
 * \param result Output record written by the call. See
 *        tokenizers_free_encode_results() for releasing it.
 *
 * The function returns void, so no error status is reported through the
 * return value.
 */
void tokenizers_encode(TokenizerHandle handle, const char* data, size_t len, int add_special_token,
TokenizerEncodeResult* result);
/*!
 * \brief Batch form of tokenizers_encode() for num_seqs inputs.
 * \param handle Tokenizer to use.
 * \param data Array of input text pointers (presumably num_seqs entries).
 * \param len Array of lengths, presumably len[i] pairs with data[i].
 * \param num_seqs Number of sequences in the batch.
 * \param add_special_token Integer flag; presumably non-zero asks the
 *        tokenizer to add special tokens - TODO confirm.
 * \param results Output array; presumably the caller provides room for
 *        num_seqs records - TODO confirm.
 */
void tokenizers_encode_batch(TokenizerHandle handle, const char** data, size_t* len,
size_t num_seqs, int add_special_token,
TokenizerEncodeResult* results);
/*!
 * \brief Variant of tokenizers_encode_batch() with a second output array,
 *        masks.
 * \param handle Tokenizer to use.
 * \param data Array of input text pointers (presumably num_seqs entries).
 * \param len Array of lengths, presumably len[i] pairs with data[i].
 * \param num_seqs Number of sequences in the batch.
 * \param add_special_token Integer flag; presumably non-zero asks the
 *        tokenizer to add special tokens - TODO confirm.
 * \param results Output array for the encodings.
 * \param masks Output array using the same record type as results.
 *        NOTE(review): presumably one mask per sequence (e.g. an attention
 *        mask); the exact meaning is not visible in this header - confirm.
 */
void tokenizers_encode_batch_with_mask(TokenizerHandle handle, const char** data, size_t* len,
size_t num_seqs, int add_special_token,
TokenizerEncodeResult* results,
TokenizerEncodeResult* masks);
/*!
 * \brief Releases encode results previously produced by this library.
 * \param results Pointer to the first record to release.
 * \param num_seqs Number of records at results.
 *
 * NOTE(review): presumably this frees the token_ids buffer of each record and
 * not the results array itself - confirm against the implementation.
 */
void tokenizers_free_encode_results(TokenizerEncodeResult* results, size_t num_seqs);
/*!
 * \brief Decodes a sequence of token ids.
 *
 * The function returns void and has no output parameter; the decoded text is
 * presumably retrieved afterwards with tokenizers_get_decode_str() - confirm.
 *
 * \param handle Tokenizer to use.
 * \param data Pointer to the token ids to decode.
 * \param len Presumably the number of ids at data - TODO confirm.
 * \param skip_special_token Integer flag; presumably non-zero omits special
 *        tokens from the decoded text - TODO confirm.
 *
 * NOTE(review): ids are taken here as const uint32_t*, whereas
 * TokenizerEncodeResult::token_ids is declared as int*; callers passing
 * encode output to this function need a pointer conversion.
 */
void tokenizers_decode(TokenizerHandle handle, const uint32_t* data, size_t len,
int skip_special_token);
/*!
 * \brief Reports a decoded string through the data and len output pointers.
 * \param handle Tokenizer to query.
 * \param data Receives a pointer to the string's characters.
 * \param len Receives the string's length (presumably in bytes).
 *
 * NOTE(review): presumably this is the result of the most recent
 * tokenizers_decode() call on handle, and the returned pointer is owned by
 * the tokenizer; lifetime and null-termination are not visible in this
 * header - confirm against the implementation.
 */
void tokenizers_get_decode_str(TokenizerHandle handle, const char** data, size_t* len);
/*!
 * \brief Reports the tokenizer's vocabulary size through an output pointer.
 * \param handle Tokenizer to query.
 * \param size Receives the vocabulary size.
 */
void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size);
/*!
 * \brief Looks up the token text for a token id.
 * \param handle Tokenizer to query.
 * \param id Token id to look up.
 * \param data Receives a pointer to the token's characters.
 * \param len Receives the token's length (presumably in bytes).
 *
 * NOTE(review): what is stored for an id outside the vocabulary, and how long
 * the returned pointer stays valid, are not visible in this header - confirm
 * against the implementation.
 */
void tokenizers_id_to_token(TokenizerHandle handle, uint32_t id, const char** data, size_t* len);
/*!
 * \brief Looks up the id of a token and stores it in *id.
 *
 * Stores -1 to *id if the token is not in the vocab; this is why id is a
 * signed int32_t even though tokenizers_id_to_token() takes a uint32_t.
 *
 * \param handle Tokenizer to query.
 * \param token Pointer to the token text.
 * \param len Length paired with token (presumably in bytes).
 * \param id Receives the token id, or -1 when the token is unknown.
 */
void tokenizers_token_to_id(TokenizerHandle handle, const char* token, size_t len, int32_t* id);
/*!
 * \brief Releases a tokenizer handle.
 * \param handle Handle to release; presumably one returned by
 *        tokenizers_new_from_str() or
 *        byte_level_bpe_tokenizers_new_from_str(). The handle should not be
 *        used after this call.
 *
 * NOTE(review): whether a NULL handle is accepted is not visible in this
 * header - confirm against the implementation.
 */
void tokenizers_free(TokenizerHandle handle);
#ifdef __cplusplus
}
#endif
#endif // TOKENIZERS_C_H_