Skip to content

Commit 2b08bde

Browse files
committed
Add GPT Tokenizer feature
1 parent 23aa920 commit 2b08bde

File tree

6 files changed

+365
-0
lines changed

6 files changed

+365
-0
lines changed

XCAChatGPT.xcodeproj/project.pbxproj

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
8B057623298FBE0400A56C9A /* DotLoadingView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B91C017298ADF4E0079AF26 /* DotLoadingView.swift */; };
3737
8B057624298FBE0400A56C9A /* ViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B91C013298ADC560079AF26 /* ViewModel.swift */; };
3838
8B05764829909A9200A56C9A /* ScrollView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B05764729909A9200A56C9A /* ScrollView.swift */; };
39+
8B612E2529D68CC9008DF5AF /* TextView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B612E2229D68CC9008DF5AF /* TextView.swift */; };
40+
8B612E2629D68CC9008DF5AF /* TokenizerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B612E2329D68CC9008DF5AF /* TokenizerView.swift */; };
41+
8B612E2729D68CC9008DF5AF /* TokenizerViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B612E2429D68CC9008DF5AF /* TokenizerViewModel.swift */; };
42+
8B612E2A29D68CE3008DF5AF /* GPTEncoder in Frameworks */ = {isa = PBXBuildFile; productRef = 8B612E2929D68CE3008DF5AF /* GPTEncoder */; };
3943
8B82463429B1F49F0069B8F7 /* ChatGPTAPIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B82463329B1F49F0069B8F7 /* ChatGPTAPIModels.swift */; };
4044
8B82463529B1F49F0069B8F7 /* ChatGPTAPIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B82463329B1F49F0069B8F7 /* ChatGPTAPIModels.swift */; };
4145
8B82463629B1F49F0069B8F7 /* ChatGPTAPIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8B82463329B1F49F0069B8F7 /* ChatGPTAPIModels.swift */; };
@@ -91,6 +95,9 @@
9195
8B057617298FBDB700A56C9A /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
9296
8B05761A298FBDB700A56C9A /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
9397
8B05764729909A9200A56C9A /* ScrollView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ScrollView.swift; sourceTree = "<group>"; };
98+
8B612E2229D68CC9008DF5AF /* TextView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TextView.swift; sourceTree = "<group>"; };
99+
8B612E2329D68CC9008DF5AF /* TokenizerView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TokenizerView.swift; sourceTree = "<group>"; };
100+
8B612E2429D68CC9008DF5AF /* TokenizerViewModel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TokenizerViewModel.swift; sourceTree = "<group>"; };
94101
8B82463329B1F49F0069B8F7 /* ChatGPTAPIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatGPTAPIModels.swift; sourceTree = "<group>"; };
95102
8B91C000298AD09E0079AF26 /* XCAChatGPT.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = XCAChatGPT.app; sourceTree = BUILT_PRODUCTS_DIR; };
96103
8B91C003298AD09E0079AF26 /* XCAChatGPTApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = XCAChatGPTApp.swift; sourceTree = "<group>"; };
@@ -130,6 +137,7 @@
130137
isa = PBXFrameworksBuildPhase;
131138
buildActionMask = 2147483647;
132139
files = (
140+
8B612E2A29D68CE3008DF5AF /* GPTEncoder in Frameworks */,
133141
);
134142
runOnlyForDeploymentPostprocessing = 0;
135143
};
@@ -234,6 +242,9 @@
234242
isa = PBXGroup;
235243
children = (
236244
8B91C003298AD09E0079AF26 /* XCAChatGPTApp.swift */,
245+
8B612E2229D68CC9008DF5AF /* TextView.swift */,
246+
8B612E2329D68CC9008DF5AF /* TokenizerView.swift */,
247+
8B612E2429D68CC9008DF5AF /* TokenizerViewModel.swift */,
237248
8B91C007298AD09F0079AF26 /* Assets.xcassets */,
238249
8B91C009298AD09F0079AF26 /* Preview Content */,
239250
);
@@ -332,6 +343,9 @@
332343
dependencies = (
333344
);
334345
name = XCAChatGPT;
346+
packageProductDependencies = (
347+
8B612E2929D68CE3008DF5AF /* GPTEncoder */,
348+
);
335349
productName = XCAChatGPT;
336350
productReference = 8B91C000298AD09E0079AF26 /* XCAChatGPT.app */;
337351
productType = "com.apple.product-type.application";
@@ -372,6 +386,9 @@
372386
Base,
373387
);
374388
mainGroup = 8B91BFF7298AD09E0079AF26;
389+
packageReferences = (
390+
8B612E2829D68CE3008DF5AF /* XCRemoteSwiftPackageReference "GPTEncoder" */,
391+
);
375392
productRefGroup = 8B91C001298AD09E0079AF26 /* Products */;
376393
projectDirPath = "";
377394
projectRoot = "";
@@ -484,10 +501,13 @@
484501
files = (
485502
8B91C012298AD0CE0079AF26 /* ChatGPTAPI.swift in Sources */,
486503
8B91C006298AD09E0079AF26 /* ContentView.swift in Sources */,
504+
8B612E2629D68CC9008DF5AF /* TokenizerView.swift in Sources */,
505+
8B612E2729D68CC9008DF5AF /* TokenizerViewModel.swift in Sources */,
487506
8B82463429B1F49F0069B8F7 /* ChatGPTAPIModels.swift in Sources */,
488507
8B91C014298ADC560079AF26 /* ViewModel.swift in Sources */,
489508
8B91C018298ADF4E0079AF26 /* DotLoadingView.swift in Sources */,
490509
8B91C004298AD09E0079AF26 /* XCAChatGPTApp.swift in Sources */,
510+
8B612E2529D68CC9008DF5AF /* TextView.swift in Sources */,
491511
8B91C01A298ADF7F0079AF26 /* MessageRowView.swift in Sources */,
492512
8B91C016298ADC9D0079AF26 /* MessageRow.swift in Sources */,
493513
);
@@ -938,6 +958,25 @@
938958
defaultConfigurationName = Release;
939959
};
940960
/* End XCConfigurationList section */
961+
962+
/* Begin XCRemoteSwiftPackageReference section */
963+
8B612E2829D68CE3008DF5AF /* XCRemoteSwiftPackageReference "GPTEncoder" */ = {
964+
isa = XCRemoteSwiftPackageReference;
965+
repositoryURL = "https://github.com/alfianlosari/GPTEncoder.git";
966+
requirement = {
967+
kind = upToNextMajorVersion;
968+
minimumVersion = 1.0.0;
969+
};
970+
};
971+
/* End XCRemoteSwiftPackageReference section */
972+
973+
/* Begin XCSwiftPackageProductDependency section */
974+
8B612E2929D68CE3008DF5AF /* GPTEncoder */ = {
975+
isa = XCSwiftPackageProductDependency;
976+
package = 8B612E2829D68CE3008DF5AF /* XCRemoteSwiftPackageReference "GPTEncoder" */;
977+
productName = GPTEncoder;
978+
};
979+
/* End XCSwiftPackageProductDependency section */
941980
};
942981
rootObject = 8B91BFF8298AD09E0079AF26 /* Project object */;
943982
}

XCAChatGPT.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

XCAChatGPT/TextView.swift

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
//
2+
// TextView.swift
3+
// XCAChatGPT
4+
//
5+
// Created by Alfian Losari on 28/03/23.
6+
//
7+
8+
import Foundation
9+
import SwiftUI
10+
11+
/// Read-only `UITextView` wrapper that renders the string tokens of a
/// `TokenOutput` back to back, giving every token a background color so the
/// token boundaries are visible.
struct TextView: UIViewRepresentable {

    /// Background palette; token `i` uses `colors[i % colors.count]`.
    let colors = [
        UIColor(red: 199/255, green: 195/255, blue: 212/255, alpha: 1),
        UIColor(red: 202/255, green: 236/255, blue: 202/255, alpha: 1),
        UIColor(red: 241/255, green: 218/255, blue: 181/255, alpha: 1),
        UIColor(red: 236/255, green: 180/255, blue: 180/255, alpha: 1),
        UIColor(red: 183/255, green: 219/255, blue: 241/255, alpha: 1)
    ]

    /// Tokenizer result whose `stringTokens` are displayed.
    let output: TokenOutput

    /// Creates the underlying text view with editing disabled.
    func makeUIView(context: Context) -> UITextView {
        let textView = UITextView()
        textView.isEditable = false
        return textView
    }

    /// Rebuilds the attributed text from `output.stringTokens` and assigns it
    /// to the text view.
    func updateUIView(_ textView: UITextView, context: Context) {
        let bodyFont = UIFont.preferredFont(forTextStyle: .body)
        let highlighted = NSMutableAttributedString()

        for (position, token) in output.stringTokens.enumerated() {
            // Cycle through the palette so adjacent tokens get different colors.
            let background = colors[position % colors.count]
            let tokenAttributes: [NSAttributedString.Key: Any] = [
                .font: bodyFont,
                .kern: 1,
                .backgroundColor: background,
            ]
            highlighted.append(NSAttributedString(string: token, attributes: tokenAttributes))
        }

        textView.attributedText = highlighted
    }

}

XCAChatGPT/TokenizerView.swift

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
//
2+
// TokenizerView.swift
3+
// XCAChatGPT
4+
//
5+
// Created by Alfian Losari on 28/03/23.
6+
//
7+
8+
import SwiftUI
9+
10+
/// Screen for typing text and inspecting how it is tokenized.
///
/// The screen is a `List` with two sections: an input section (multi-line text
/// field plus "Clear" / "Show example" buttons and a progress indicator) and an
/// output section (token and character counts, an output-type picker and the
/// selected rendering). All state is read from and written to
/// `TokenizerViewModel`.
struct TokenizerView: View {

    @StateObject var vm = TokenizerViewModel()
    /// Whether the input text field currently has keyboard focus.
    @FocusState private var isFocused: Bool

    var body: some View {
        List {
            inputSection
            outputSection
        }
        .navigationTitle("GPT Tokenizer")
    }

    // MARK: - Input

    /// Text field and action buttons, under an "Input" header.
    var inputSection: some View {
        Section {
            inputField
            actionRow
        } header: {
            Text("Input")
        }
    }

    /// Vertically growing text field with a keyboard toolbar "Done" button
    /// that drops focus.
    private var inputField: some View {
        TextField("Enter text to tokenize", text: $vm.inputText, axis: .vertical)
            .focused($isFocused)
            .lineLimit(4...12)
            .toolbar {
                ToolbarItem(placement: .keyboard) {
                    HStack {
                        Spacer()
                        Button("Done") { isFocused = false }
                    }
                }
            }
    }

    /// "Clear" and "Show example" buttons, followed by a spinner while the
    /// view model reports that it is tokenizing.
    private var actionRow: some View {
        HStack {
            Button("Clear") {
                withAnimation { vm.inputText = "" }
            }
            .buttonStyle(.borderedProminent)
            .disabled(vm.inputText.isEmpty)

            Button("Show example") {
                withAnimation {
                    vm.inputText = exampleText
                    isFocused = false
                }
            }
            .buttonStyle(.borderedProminent)
            .disabled(vm.inputText == exampleText)

            Spacer()

            if vm.isTokenizing {
                ProgressView()
            }
        }
        .padding(.vertical, 2)
    }

    // MARK: - Output

    /// Tokenization result (when there is one) plus the explanatory footer.
    /// The "Output" header is only shown while a result exists.
    var outputSection: some View {
        Section {
            if let output = vm.output {
                outputContent(for: output)
            }
        } header: {
            if vm.output != nil {
                Text("Output")
            }
        } footer: {
            Text(footerText).padding(.top, vm.output == nil ? 0 : 8)
        }
    }

    /// Counts, output-type picker and the rendering selected by the picker.
    private func outputContent(for output: TokenOutput) -> some View {
        VStack(alignment: .leading) {
            HStack {
                statistic("Tokens", count: output.tokens.count)

                Divider()
                    .frame(height: 32)

                statistic("Characters", count: output.text.count)
            }
            .frame(maxWidth: .infinity, alignment: .center)

            Picker("Output Type", selection: $vm.outputType) {
                Text("Text").tag(OutputType.text)
                Text("Token Ids").tag(OutputType.tokenIds)
            }
            .pickerStyle(.segmented)

            switch vm.outputType {
            case .text:
                TextView(output: output)
                    .frame(height: 240)

            case .tokenIds:
                Text("\(output.tokens.description)")
                    .textSelection(.enabled)
                    .multilineTextAlignment(.leading)
                    .padding(.vertical)
            }
        }
    }

    /// A small caption stacked above a count.
    private func statistic(_ title: LocalizedStringKey, count: Int) -> some View {
        VStack {
            Text(title).font(.subheadline)
            Text("\(count)").font(.headline)
        }
    }
}
120+
121+
/// Preview of `TokenizerView`, wrapped in a `NavigationStack` so the
/// navigation title set by the view is shown.
struct TokenizerView_Previews: PreviewProvider {
    static var previews: some View {
        NavigationStack { TokenizerView() }
    }
}
128+
129+
130+
131+
/// Sample input loaded by the "Show example" button: three short paragraphs
/// (a word, an emoji and a digit sequence) separated by blank lines.
let exampleText = """
Many words map to one token, but some don't: indivisible.

Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾

Sequences of characters commonly found next to each other may be grouped together: 1234567890
"""
138+
139+
140+
/// Explanatory text shown in the footer of the output section.
///
/// The fourth paragraph was previously a lowercase sentence fragment followed
/// by a dangling sentence; it is now a single sentence.
let footerText = """
The GPT family of models process text using tokens, which are common sequences of characters found in text. The models understand the statistical relationships between these tokens, and excel at producing the next token in a sequence of tokens.

You can use this tool to understand how a piece of text would be tokenized by the API, and the total count of tokens in that piece of text.

A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for common English text. This translates to roughly ¾ of a word (so 100 tokens ~= 75 words).

If your input contains one or more unicode characters that map to multiple tokens, the output visualization may display the bytes in each token in a non-standard way.

If you need a programmatic interface for tokenizing text, check out the GPTEncoder SPM or Cocoapods lib for Swift.
"""
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
//
2+
// TokenizerViewModel.swift
3+
// XCAChatGPT
4+
//
5+
// Created by Alfian Losari on 28/03/23.
6+
//
7+
8+
import Combine
9+
import Foundation
10+
import SwiftUI
11+
import GPTEncoder
12+
13+
/// Result of tokenizing one piece of text.
struct TokenOutput {
    /// The text that was tokenized.
    let text: String

    /// String form of each token, in token order.
    let stringTokens: [String]

    /// Integer token ids, in token order.
    let tokens: [Int]
}
18+
19+
/// Which rendering of a tokenization result is shown.
enum OutputType: Identifiable {
    /// The text with its tokens highlighted.
    case text
    /// The list of token ids.
    case tokenIds

    /// Each case is its own identity.
    var id: OutputType { self }
}
23+
24+
/// Observable state for the tokenizer screen.
///
/// Watches `inputText`; a non-empty value is debounced and tokenized in a
/// background `Task`, and an empty value clears the result immediately.
///
/// Cancellation contract: whoever cancels `task` is responsible for the
/// `isTokenizing` flag. A cancelled tokenization never touches published
/// state, so it can neither publish a stale result nor switch off the
/// spinner of the run that replaced it.
class TokenizerViewModel: ObservableObject, @unchecked Sendable {

    /// Encoder used to encode the input and to decode individual tokens.
    let tokenizer = GPTEncoder()

    @Published var inputText = ""
    @Published var output: TokenOutput?
    @Published var isTokenizing = false
    @Published var error: String?
    @Published var isShowingError = false
    @Published var outputType = OutputType.text

    var cancellables = Set<AnyCancellable>()
    /// The tokenization currently in flight, if any.
    var task: Task<(), Never>?

    init() {
        startObserve()
    }

    /// Subscribes to `inputText`.
    ///
    /// - Non-empty values are debounced by 0.3 s on the main queue; the
    ///   previous tokenization is cancelled and a new one is started.
    /// - Empty values cancel any tokenization in flight and clear `output`
    ///   and `isTokenizing` right away.
    func startObserve() {
        $inputText
            .filter { !$0.isEmpty }
            .debounce(for: 0.3, scheduler: DispatchQueue.main)
            .sink { [weak self] value in
                guard let self = self else { return }
                self.task?.cancel()
                // The debounced value may be stale: the field can have been
                // cleared while the debounce timer was still running.
                guard !self.inputText.isEmpty else { return }
                self.task = Task { await self.tokenize(value: value) }
            }
            .store(in: &cancellables)

        $inputText
            .filter { $0.isEmpty }
            .sink { [weak self] _ in
                guard let self = self else { return }
                // Stop a run that is still in flight; otherwise it would
                // publish its result after the input was cleared.
                self.task?.cancel()
                self.task = nil
                withAnimation {
                    self.output = nil
                    self.isTokenizing = false
                }
            }
            .store(in: &cancellables)
    }

    /// Tokenizes `value` and publishes the result on the main actor.
    ///
    /// Does nothing to published state if the surrounding task is cancelled,
    /// either before work starts or by the time the result is ready.
    ///
    /// - Parameter value: The text to tokenize.
    func tokenize(value: String) async {
        // `MainActor.run` executes inside the *current* task, so
        // `Task.isCancelled` reflects cancellation of `self.task`. A fresh
        // unstructured `Task { }` would not inherit that cancellation.
        let didStart = await MainActor.run { () -> Bool in
            if Task.isCancelled { return false }
            withAnimation {
                self.isTokenizing = true
            }
            return true
        }
        guard didStart else { return }

        let tokens = tokenizer.encode(text: value)
        let stringTokens = tokens.map { tokenizer.decode(tokens: [$0]) }

        await MainActor.run {
            // Checked on the main actor, i.e. serialized with the sinks that
            // cancel this task, so a superseded run cannot slip through.
            if Task.isCancelled { return }
            withAnimation {
                self.output = .init(text: value, stringTokens: stringTokens, tokens: tokens)
                self.isTokenizing = false
            }
        }
    }

}

0 commit comments

Comments
 (0)