feat(main): add FridaNative, LM, and RegisterTray modules
- Add the FridaNative module for Frida-related native code
- Add the LM module for llama-model-related native code
- Add the RegisterTray module for registering the system tray icon and related operations
- Create the corresponding header files, source files, and project configuration files
src/main/Cpp/LM/tiktoken.h · 269 lines · Normal file
@@ -0,0 +1,269 @@
#pragma once

#include <re2/re2.h>
#include "unordered_dense.h"

#include <cassert>
#include <functional>  // std::function (used by _byte_pair_merge)
#include <limits>
#include <memory>      // std::unique_ptr, std::make_unique
#include <optional>
#include <regex>
#include <stdexcept>   // std::runtime_error
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace tiktoken {
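// Core BPE merge loop: `parts` holds candidate split offsets into `piece`,
// each paired with the rank of the merge that starts there. The loop
// repeatedly merges the adjacent pair with the lowest rank until no mergeable
// pair remains, then maps each surviving span through `func` to produce the
// output tokens.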
static auto _byte_pair_merge(
    const std::string &piece,
    const ankerl::unordered_dense::map<std::string, int> &ranks,
    std::function<int(int, int)> func
) -> std::vector<int> {
    std::vector<std::pair<int, int>> parts;
    parts.reserve(piece.size() + 1);
    for (auto idx = 0U; idx < piece.size() + 1; ++idx) {
        parts.emplace_back(idx, std::numeric_limits<int>::max());
    }

    auto get_rank = [&piece, &ranks](
        const std::vector<std::pair<int, int>> &parts,
        int start_idx,
        int skip
    ) -> std::optional<int> {
        if (start_idx + skip + 2 < parts.size()) {
            auto s = parts[start_idx].first;
            auto e = parts[start_idx + skip + 2].first;
            auto key = piece.substr(s, e - s);
            auto iter = ranks.find(key);
            if (iter != ranks.end()) {
                return iter->second;
            }
        }
        return std::nullopt;
    };

    for (auto i = 0U; i < parts.size() - 2; ++i) {
        auto rank = get_rank(parts, i, 0);
        if (rank) {
            assert(*rank != std::numeric_limits<int>::max());
            parts[i].second = *rank;
        }
    }

    while (true) {
        if (parts.size() == 1) break;

        auto min_rank = std::make_pair<int, int>(std::numeric_limits<int>::max(), 0);
        for (auto i = 0U; i < parts.size() - 1; ++i) {
            auto rank = parts[i].second;
            if (rank < min_rank.first) {
                // cast: i is unsigned, the pair holds int
                min_rank = { rank, static_cast<int>(i) };
            }
        }

        if (min_rank.first != std::numeric_limits<int>::max()) {
            auto i = min_rank.second;
            auto rank = get_rank(parts, i, 1);
            if (rank) {
                parts[i].second = *rank;
            } else {
                parts[i].second = std::numeric_limits<int>::max();
            }
            if (i > 0) {
                auto rank = get_rank(parts, i - 1, 1);
                if (rank) {
                    parts[i - 1].second = *rank;
                } else {
                    parts[i - 1].second = std::numeric_limits<int>::max();
                }
            }

            parts.erase(parts.begin() + (i + 1));
        } else {
            break;
        }
    }

    std::vector<int> out;
    out.reserve(parts.size() - 1);
    for (auto i = 0U; i < parts.size() - 1; ++i) {
        out.push_back(func(parts[i].first, parts[i + 1].first));
    }
    return out;
}
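// Encode one piece: a single byte is looked up directly; longer pieces go
// through the merge loop, with `func` resolving each final span to its rank.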
static auto byte_pair_encode(
    const std::string &piece,
    const ankerl::unordered_dense::map<std::string, int> &ranks
) -> std::vector<int> {
    if (piece.size() == 1) {
        return {ranks.at(piece)};
    }

    auto func = [&piece, &ranks](int start, int stop) -> int {
        std::string key = piece.substr(start, stop - start);
        return ranks.at(key);
    };

    return _byte_pair_merge(piece, ranks, func);
}
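// Byte-level BPE tokenizer in the style of OpenAI's tiktoken: `regex_`
// splits text into pieces, `encoder_` maps pieces to token ids, and
// `special_regex_` (built from the special-token strings) lets encode()
// recognize special tokens inline.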
class tiktoken {
public:
    tiktoken() = default;
    tiktoken(
        ankerl::unordered_dense::map<std::string, int> encoder,
        ankerl::unordered_dense::map<std::string, int> special_encoder,
        const std::string &pattern
    ) {
        regex_ = std::make_unique<re2::RE2>("(" + pattern + ")");

        std::string special_pattern;
        for (const auto &item : special_encoder) {
            if (!special_pattern.empty()) {
                special_pattern += "|";
            }
            special_pattern += re2::RE2::QuoteMeta(item.first);
        }
        if (special_pattern.empty()) {
            special_regex_ = nullptr;
        } else {
            special_regex_ = std::make_unique<re2::RE2>("(" + special_pattern + ")");
        }

        encoder_ = std::move(encoder);
        special_tokens_encoder = std::move(special_encoder);

        for (const auto &[k, v] : encoder_) {
            decoder_.emplace(v, k);
        }
        assert(encoder_.size() == decoder_.size() && "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?");

        for (const auto &[k, v] : special_tokens_encoder) {
            special_tokens_decoder.emplace(v, k);
        }
    }
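    // Public API: encode() honors special tokens, encode_ordinary() treats
    // the whole input as plain text.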
    auto encode_ordinary(const std::string &text) const -> std::vector<int> {
        return _encode_ordinary_native(text);
    }

    auto encode(const std::string &text) const -> std::vector<int> {
        return _encode_native(text, special_tokens_encoder).first;
    }

    auto encode_single_piece(const std::string &text) const -> std::vector<int> {
        auto iter = encoder_.find(text);
        if (iter != encoder_.end()) {
            return {iter->second};
        }
        return byte_pair_encode(text, encoder_);
    }

    auto decode(const std::vector<int> &tokens) const -> std::string {
        return _decode_native(tokens);
    }

private:
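    // Finds the next special token in `input` that appears in
    // `allowed_special`. Returns the matched token (if any) together with
    // the ordinary text preceding it; `input` is consumed up to the match.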
    auto split_with_allowed_special_token(
        re2::StringPiece &input,
        const ankerl::unordered_dense::map<std::string, int> &allowed_special
    ) const -> std::pair<std::optional<std::string>, re2::StringPiece> {
        if (special_regex_ == nullptr) return { std::nullopt, input };

        auto start = input.begin();
        std::string special;
        while (true) {
            if (!re2::RE2::FindAndConsume(&input, *special_regex_, &special)) {
                break;
            }

            if (allowed_special.count(special) == 1) {
                return { std::move(special), re2::StringPiece(start, input.begin() - start - special.size()) };
            }
        }

        return { std::nullopt, input };
    }
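    // Encodes text with no special-token handling: each regex piece is
    // either a direct vocabulary hit or is split via byte_pair_encode.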
    auto _encode_ordinary_native(const std::string &text) const -> std::vector<int> {
        std::vector<int> ret;
        re2::StringPiece input(text);

        std::string piece;
        while (re2::RE2::FindAndConsume(&input, *regex_, &piece)) {
            auto iter = encoder_.find(piece);
            if (iter != encoder_.end()) {
                ret.push_back(iter->second);
                continue;
            }
            auto tokens = byte_pair_encode(piece, encoder_);
            ret.insert(ret.end(), tokens.begin(), tokens.end());
        }
        return ret;
    }
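    // Full encoder: alternates between encoding ordinary text up to the
    // next allowed special token and emitting that token's id. The second
    // return value is the token length of the last ordinary piece, useful
    // for trimming an incomplete trailing piece.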
    auto _encode_native(
        const std::string &text,
        const ankerl::unordered_dense::map<std::string, int> &allowed_special
    ) const -> std::pair<std::vector<int>, int> {
        std::vector<int> ret;
        int last_piece_token_len = 0;
        re2::StringPiece input(text);

        while (true) {
            auto [special, sub_input] = split_with_allowed_special_token(input, allowed_special);
            std::string piece;
            while (re2::RE2::FindAndConsume(&sub_input, *regex_, &piece)) {
                auto iter = encoder_.find(piece);
                if (iter != encoder_.end()) {
                    last_piece_token_len = 1;
                    ret.push_back(iter->second);
                    continue;
                }
                auto tokens = byte_pair_encode(piece, encoder_);
                last_piece_token_len = tokens.size();
                ret.insert(ret.end(), tokens.begin(), tokens.end());
            }

            if (special) {
                int token = special_tokens_encoder.at(*special);
                ret.push_back(token);
                last_piece_token_len = 0;
            } else {
                break;
            }
        }

        return { ret, last_piece_token_len };
    }
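    // Decodes ids back to bytes via decoder_, falling back to the
    // special-token table; throws on an unknown id.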
    auto _decode_native(const std::vector<int> &tokens) const -> std::string {
        std::string ret;
        ret.reserve(tokens.size() * 2);
        for (auto token : tokens) {
            std::string token_bytes;
            auto iter = decoder_.find(token);
            if (iter != decoder_.end()) {
                token_bytes = iter->second;
            } else {
                iter = special_tokens_decoder.find(token);
                if (iter != special_tokens_decoder.end()) {
                    token_bytes = iter->second;
                } else {
                    throw std::runtime_error("unknown token: " + std::to_string(token));
                }
            }
            ret += token_bytes;
        }
        return ret;
    }

    ankerl::unordered_dense::map<std::string, int> encoder_;
    ankerl::unordered_dense::map<std::string, int> special_tokens_encoder;
    ankerl::unordered_dense::map<int, std::string> decoder_;
    ankerl::unordered_dense::map<int, std::string> special_tokens_decoder;
    std::unique_ptr<re2::RE2> regex_;
    std::unique_ptr<re2::RE2> special_regex_;
};

} // namespace tiktoken
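A minimal usage sketch for the class above (the toy vocabulary and split pattern below are hypothetical stand-ins, not values shipped by this commit):

    #include "tiktoken.h"
    #include <cstdio>

    int main() {
        // Hypothetical toy vocabulary; real rank and special-token maps
        // would be loaded from a model's tokenizer files.
        ankerl::unordered_dense::map<std::string, int> encoder{
            {"h", 0}, {"i", 1}, {"hi", 2}};
        ankerl::unordered_dense::map<std::string, int> special{{"<|eos|>", 3}};
        tiktoken::tiktoken tok(encoder, special, "[a-z]+");

        auto ids = tok.encode("hi<|eos|>");  // -> {2, 3}
        std::string text = tok.decode(ids);  // -> "hi<|eos|>"
        std::printf("%zu tokens, round-trip: %s\n", ids.size(), text.c_str());
        return 0;
    }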