diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index e2a903d96e..1b570c822b 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -188,3 +188,8 @@ stringifying hɛloʊ wɜːld bielik +nemotron +BIOES +viterbi +argmaxes +unpadded diff --git a/.eslintrc.js b/.eslintrc.js index f95187b46b..8cb84b9ff8 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -13,6 +13,7 @@ const VALID_CATEGORIES = [ 'Models - Semantic Segmentation', 'Models - Speech To Text', 'Models - Style Transfer', + 'Models - Privacy Filter', 'Models - Text Embeddings', 'Models - Text to Speech', 'Models - VLM', diff --git a/.gitignore b/.gitignore index d481899882..86ce3a9042 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,7 @@ packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/mo Makefile *.pte +.agents +.claude +skills-lock.json + diff --git a/apps/llm/app/_layout.tsx b/apps/llm/app/_layout.tsx index e5dfc198a5..fc28a9d1b1 100644 --- a/apps/llm/app/_layout.tsx +++ b/apps/llm/app/_layout.tsx @@ -146,6 +146,14 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} /> + ); diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx index c7d5bae220..72358ae72c 100644 --- a/apps/llm/app/index.tsx +++ b/apps/llm/app/index.tsx @@ -41,6 +41,12 @@ export default function Home() { > Multimodal LLM (VLM) + router.navigate('privacy_filter/')} + > + Privacy Filter (PII) + ); diff --git a/apps/llm/app/privacy_filter/index.tsx b/apps/llm/app/privacy_filter/index.tsx new file mode 100644 index 0000000000..e6541dfe52 --- /dev/null +++ b/apps/llm/app/privacy_filter/index.tsx @@ -0,0 +1,251 @@ +import { useMemo, useState } from 'react'; +import { + ActivityIndicator, + ScrollView, + StyleSheet, + Text, + TouchableOpacity, + View, +} from 'react-native'; +import { useIsFocused } from '@react-navigation/native'; +import { useSafeAreaInsets } from 'react-native-safe-area-context'; +import { + PiiEntity, + PRIVACY_FILTER_NEMOTRON, + 
PRIVACY_FILTER_OPENAI, + PrivacyFilterModelSources, + usePrivacyFilter, +} from 'react-native-executorch'; +import ColorPalette from '../../colors'; +import { ModelOption, ModelPicker } from '../../components/ModelPicker'; +import { + buildSegments, + colorForLabel, + matchEntities, +} from '../../utils/piiMatching'; + +/* cspell:disable */ +// Sample tuned for the OpenAI base model — exercises the 8 entity types it +// recognizes (person, email, phone, account_number, address, date, url, +// secret). +const OPENAI_SAMPLE = `My name is Sarah Chen and I work as a senior engineer at Acme Corp. You can reach me at sarah.chen@acmecorp.io or call my direct line at (415) 923-0847. For billing inquiries, my account number is ACC-8821-4490-3371. + +I've been living at 17 Birchwood Lane, Portland, OR 97201 since October 3rd, 2019. Before that I was at 8 Rue de Rivoli, Paris, 75001, France. My personal website is https://sarahchen.dev and my GitHub is https://github.com/schen-eng. Feel free to connect — I usually respond within a business day. + +My date of birth is June 12, 1991, and my backup email is s.chen.personal@gmail.com in case the primary address is unreachable. This message also contains a confidential API key: sk-T93kXpLm2NvBqR7dYwZ4. Please do not share it outside the team. You can also reach my colleague James Okonkwo at j.okonkwo@acmecorp.io or at his mobile +44 7911 123456.`; +// Sample tuned for the OpenMed Nemotron model — covers categories the base +// OpenAI model doesn't have (medical, financial, technical, demographic). + +const NEMOTRON_SAMPLE = `Patient intake for Maria Lopez, female, age 47, blood type O+, born 1978-05-12. MRN 994-2210-AB; health plan beneficiary number HPBN-552-9931 with Aetna. SSN 412-55-7821, national ID DNI 88-7762-X. Primary occupation: registered nurse, currently employed full-time at Mercy General. Religion: Catholic; political view: independent. + +Reach her at maria.lopez@example.com or +1 (415) 555-0142. 
Mailing address: 84 Cedar Hill Road, Apt 3B, Berkeley, CA 94703, United States. Vehicle plate 7XKL922; driver license CA-D1294883. + +Payment for last visit: Visa ending 4992-1133-7820-4419, expires 11/28, CVV 884. Bank routing 021000089, SWIFT BIC CHASUS33. Employer EIN tax ID 47-3320118. Customer ID CUST-553201, employee ID EMP-A0093. + +Workstation MAC 3C:22:FB:8E:01:9A, IPv4 10.0.42.118, device IMEI 359888061234560. Service account API key sk-live-Tn8x3pLm2NvBqR7dYwZ4QF, password Hunter2!Spring. Session cookie sid=eyJ1c2VyIjoiOTk0MjIxMCJ9.`; +/* cspell:enable */ + +const MODEL_OPTIONS: ModelOption[] = [ + { label: 'OpenAI Privacy Filter (8 entities)', value: PRIVACY_FILTER_OPENAI }, + { + label: 'OpenMed Nemotron (55 entities)', + value: PRIVACY_FILTER_NEMOTRON, + }, +]; + +// Pick the right sample to display/run based on the active model. +function sampleFor(model: PrivacyFilterModelSources): string { + return model.modelName === PRIVACY_FILTER_NEMOTRON.modelName + ? NEMOTRON_SAMPLE + : OPENAI_SAMPLE; +} + +function HighlightedText({ + source, + entities, +}: { + source: string; + entities: PiiEntity[]; +}) { + const segments = useMemo( + () => buildSegments(source, matchEntities(source, entities)), + [source, entities] + ); + return ( + + {segments.map((seg, i) => + seg.label ? 
( + + {seg.text} + + ) : ( + {seg.text} + ) + )} + + ); +} + +function PrivacyFilterScreen() { + const { bottom } = useSafeAreaInsets(); + const [entities, setEntities] = useState(null); + const [runError, setRunError] = useState(null); + const [inferenceMs, setInferenceMs] = useState(null); + const [selectedModel, setSelectedModel] = useState( + PRIVACY_FILTER_OPENAI + ); + + const filter = usePrivacyFilter({ model: selectedModel }); + const sampleText = sampleFor(selectedModel); + + const onRun = async () => { + setRunError(null); + setEntities(null); + setInferenceMs(null); + const startedAt = Date.now(); + try { + const result = await filter.generate(sampleText); + const elapsed = Date.now() - startedAt; + setInferenceMs(elapsed); + setEntities(result); + } catch (e) { + setRunError(e instanceof Error ? e.message : String(e)); + } + }; + + const disabled = !filter.isReady || filter.isGenerating; + + return ( + + { + setEntities(null); + setRunError(null); + setInferenceMs(null); + setSelectedModel(m); + }} + label="Model" + disabled={filter.isGenerating} + /> + + {filter.error && ( + + + Load error: {filter.error.message} + + + )} + + {!filter.isReady && !filter.error && ( + + + + Downloading model…{' '} + {Math.round((filter.downloadProgress ?? 0) * 100)}% + + + )} + + + {entities ? ( + + ) : ( + {sampleText} + )} + + + + {filter.isGenerating ? ( + + ) : ( + + Detect PII + {inferenceMs !== null && ` · ${inferenceMs} ms`} + + )} + + + {runError && ( + + Run error: {runError} + + )} + + ); +} + +export default function PrivacyFilterScreenWrapper() { + const isFocused = useIsFocused(); + return isFocused ? 
/**
 * A detected entity span pinned to a character range in the source text.
 * `start` is inclusive and `end` is exclusive, so
 * `source.slice(start, end)` yields the highlighted text.
 */
export interface EntityMatch {
  start: number;
  end: number;
  label: string;
}

/**
 * The only fields this module reads from a detected entity. The entity
 * objects shown in the library docs (`{ label, text, startToken, endToken }`)
 * satisfy this shape structurally, so arrays of them can be passed as-is.
 */
export interface MatchableEntity {
  readonly text?: string | null;
  readonly label: string;
}

// Escapes every regex syntax character so arbitrary entity text can be
// embedded in a pattern. All escaped characters are syntax characters, which
// keeps the result valid under the `u` flag as well.
const escapeRegex = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// What counts as "part of a word" when checking boundaries: Unicode letters,
// combining marks, digits and underscore (a Unicode-aware superset of `\w`).
const WORD_CHAR = '[\\p{L}\\p{M}\\p{N}_]';
const STARTS_WITH_WORD_CHAR = new RegExp(`^${WORD_CHAR}`, 'u');
const ENDS_WITH_WORD_CHAR = new RegExp(`${WORD_CHAR}$`, 'u');

/**
 * Word-boundary-aware search so e.g. "John" doesn't match the "John" inside
 * "Johnson".
 *
 * A boundary is only required on a side of the needle that itself starts or
 * ends with a word character. A plain `\b` on both sides would make needles
 * such as "(415) 923-0847", "+44 7911 123456" or "O+" unmatchable after
 * whitespace, because `\b` needs a word character on one side of it.
 * Boundaries are Unicode-aware, so "José" is handled like "John".
 *
 * The search runs against the full `source` (not a slice), so the character
 * just before `from` is still visible to the boundary check and a match can
 * never start in the middle of a word.
 * @param source - Text to search in.
 * @param needle - Literal text to look for.
 * @param from - Index in `source` at which the search starts.
 * @returns Index of the first match at or after `from`, or -1 when there is
 * none or `needle` is empty.
 */
function findWordBounded(source: string, needle: string, from: number): number {
  if (needle.length === 0) return -1;
  const head = STARTS_WITH_WORD_CHAR.test(needle) ? `(?<!${WORD_CHAR})` : '';
  const tail = ENDS_WITH_WORD_CHAR.test(needle) ? `(?!${WORD_CHAR})` : '';
  // `g` is needed only so that `lastIndex` is honored as the start offset.
  const re = new RegExp(`${head}${escapeRegex(needle)}${tail}`, 'gu');
  re.lastIndex = Math.max(0, from);
  const match = re.exec(source);
  return match ? match.index : -1;
}

/**
 * Map detected entities (which carry decoded text only) onto character
 * ranges in the source text by scanning forward.
 *
 * The native runner's `text` field is the BPE-detokenized form of the
 * entity, which often differs from the source by whitespace, punctuation
 * spacing, or stripped specials. Strategy:
 * 1) Try a word-bounded match for the whole entity from the cursor onward.
 * 2) On miss, fall back to each word of the entity text that is longer than
 *    one character, matched individually, so most of the span is still
 *    highlighted.
 *
 * Order-preserving: the cursor advances past every successful match, so
 * duplicate strings resolve left-to-right. Entities with empty or missing
 * text are skipped.
 * @param source - The full text the model was run against.
 * @param entities - Entities returned by the native runner.
 * @returns Sorted, non-overlapping char-range matches.
 */
export function matchEntities(
  source: string,
  entities: readonly MatchableEntity[]
): EntityMatch[] {
  const matches: EntityMatch[] = [];
  // Every search starts at `cursor` and every hit moves it to the hit's end,
  // so `matches` is built already sorted and free of overlaps.
  let cursor = 0;
  for (const entity of entities) {
    const text = entity.text;
    if (!text) continue;

    const exact = findWordBounded(source, text, cursor);
    if (exact !== -1) {
      cursor = exact + text.length;
      matches.push({ start: exact, end: cursor, label: entity.label });
      continue;
    }

    // Whole-entity miss: highlight whichever individual words can be found.
    for (const word of text.split(/\s+/)) {
      if (word.length <= 1) continue;
      const idx = findWordBounded(source, word, cursor);
      if (idx === -1) continue;
      cursor = idx + word.length;
      matches.push({ start: idx, end: cursor, label: entity.label });
    }
  }
  return matches;
}

/**
 * One run of text to render: `label` is the entity label for highlighted
 * runs and `null` for plain text.
 */
export interface Segment {
  text: string;
  label: string | null;
}

/**
 * Slice the source text into alternating plain / labeled runs based on the
 * matched entity ranges. Overlaps are dropped (the earlier match wins).
 *
 * The input is sorted defensively (on a copy) so an unsorted array no longer
 * loses matches. Empty or inverted ranges are ignored, and ranges running
 * past the end of `source` are clamped.
 * @param source - The full text to slice.
 * @param matches - Char-range matches from {@link matchEntities}.
 * @returns An ordered array of segments covering the entire source.
 */
export function buildSegments(
  source: string,
  matches: readonly EntityMatch[]
): Segment[] {
  const ordered = [...matches].sort((a, b) => a.start - b.start);
  const segments: Segment[] = [];
  let pos = 0;
  for (const m of ordered) {
    const end = Math.min(m.end, source.length);
    // Skip overlaps with an already-emitted match and degenerate ranges.
    if (m.start < pos || end <= m.start) continue;
    if (m.start > pos) {
      segments.push({ text: source.slice(pos, m.start), label: null });
    }
    segments.push({ text: source.slice(m.start, end), label: m.label });
    pos = end;
  }
  if (pos < source.length) {
    segments.push({ text: source.slice(pos), label: null });
  }
  return segments;
}

/**
 * Pastel color palette + stable label-to-color mapping. Same label always
 * gets the same color across renders / runs.
 */
const PALETTE: readonly string[] = [
  '#ffd4a8',
  '#b8e1ff',
  '#d4c5f9',
  '#c3e8c3',
  '#ffe8b8',
  '#f8c6c6',
  '#e3c8a8',
  '#ff9aa2',
  '#b6e3d4',
  '#ffd6e0',
  '#cdb4db',
  '#ffc8a2',
  '#a2d2ff',
  '#bde0fe',
  '#fcd5ce',
];

/**
 * Pick the palette color for an entity label.
 *
 * Uses a 31-multiplier string hash truncated to 32 bits, then indexes the
 * palette with its absolute value, so the mapping depends only on the label
 * text.
 * @param label - Entity label, e.g. `private_person`.
 * @returns A hex color string from the palette.
 */
export function colorForLabel(label: string): string {
  let hash = 0;
  for (let i = 0; i < label.length; i++) {
    // eslint-disable-next-line no-bitwise
    hash = (hash * 31 + label.charCodeAt(i)) | 0;
  }
  return PALETTE[Math.abs(hash) % PALETTE.length] as string;
}
+--- + +Privacy Filter is a token-level model that scans text for personally identifiable information (PII) — names, emails, phone numbers, addresses, SSNs, secrets, and more — and returns the detected spans together with the entity type. Inference runs entirely on-device, so the input text never leaves the user's phone. + +:::info +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/privacy-filter). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +::: + +## API Reference + +- For detailed API Reference for `usePrivacyFilter` see: [`usePrivacyFilter` API Reference](../../06-api-reference/functions/usePrivacyFilter.md). +- For all Privacy Filter models available out-of-the-box in React Native ExecuTorch see: [Privacy Filter Models](../../06-api-reference/index.md#models---privacy-filter). + +## High Level Overview + +```typescript +import { + usePrivacyFilter, + PRIVACY_FILTER_OPENAI, +} from 'react-native-executorch'; + +const model = usePrivacyFilter({ model: PRIVACY_FILTER_OPENAI }); + +try { + const entities = await model.generate( + 'My name is Sarah Chen and my email is sarah@example.com.' + ); + console.log(entities); + // [ + // { label: 'private_person', text: 'Sarah Chen', startToken: 3, endToken: 5 }, + // { label: 'private_email', text: 'sarah@example.com', startToken: 11, endToken: 14 }, + // ] +} catch (error) { + console.error(error); +} +``` + +### Arguments + +`usePrivacyFilter` takes [`PrivacyFilterProps`](../../06-api-reference/interfaces/PrivacyFilterProps.md) that consists of: + +- `model` of type [`PrivacyFilterModelSources`](../../06-api-reference/interfaces/PrivacyFilterModelSources.md) containing the model source, tokenizer source, and BIOES label list. 
+- An optional flag [`preventLoad`](../../06-api-reference/interfaces/PrivacyFilterProps.md#preventload) which prevents auto-loading of the model. + +You need more details? Check the following resources: + +- For detailed information about `usePrivacyFilter` arguments check this section: [`usePrivacyFilter` arguments](../../06-api-reference/functions/usePrivacyFilter.md#parameters). +- For all Privacy Filter models available out-of-the-box in React Native ExecuTorch see: [Privacy Filter Models](../../06-api-reference/index.md#models---privacy-filter). +- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. + +### Returns + +`usePrivacyFilter` returns an object called `PrivacyFilterType` containing a `generate` function for running detection. To get more details please read: [`PrivacyFilterType` API Reference](../../06-api-reference/interfaces/PrivacyFilterType.md). + +## Running the model + +To run the model, call the [`generate`](../../06-api-reference/interfaces/PrivacyFilterType.md#generate) method with the text you want to scan. The method returns a promise that resolves to an array of [`PiiEntity`](../../06-api-reference/interfaces/PiiEntity.md) objects, each describing one detected span (`label`, decoded `text`, and inclusive `startToken` / exclusive `endToken` indices into the tokenized input). + +Inputs are processed in sliding windows with 50% overlap (the window size matches the model's exported `forward` input shape), so there is no length limit — long documents are scanned end-to-end without truncation. + +:::note +Token indices in returned entities are positions in the tokenizer's output (the unpadded `encode()` stream), not character offsets in the original string. Use the entity's decoded `text` field if you want to display or redact spans verbatim. +::: + +### Tuning precision and recall + +Both built-in models ship with neutral, validity-only Viterbi decoding by default. 
If you want to shift the precision/recall tradeoff, pass an optional [`viterbiBiases`](../../06-api-reference/interfaces/PrivacyFilterModelSources.md#viterbibiases) object — six floats matching the operating-point schema in OpenAI's `viterbi_calibration.json`. Negative `backgroundToStart` makes the decoder enter spans more eagerly (higher recall); positive `backgroundStay` keeps it in the background label more often (higher precision). + +## Example + +```tsx +import React, { useState } from 'react'; +import { Button, Text, View, TextInput, ScrollView } from 'react-native'; +import { + usePrivacyFilter, + PRIVACY_FILTER_OPENAI, + PiiEntity, +} from 'react-native-executorch'; + +export default function App() { + const model = usePrivacyFilter({ model: PRIVACY_FILTER_OPENAI }); + const [text, setText] = useState( + 'My name is Sarah Chen and you can reach me at sarah.chen@example.com.' + ); + const [entities, setEntities] = useState([]); + + const handleScan = async () => { + if (!model.isReady) { + console.error('Privacy Filter model is not loaded yet.'); + return; + } + try { + const detected = await model.generate(text); + setEntities(detected); + } catch (error) { + console.error('Error during running Privacy Filter model', error); + } + }; + + return ( + + +