Skip to content

Commit da3530f

Browse files
committed
added auto complete endpoint
1 parent ae0ad34 commit da3530f

7 files changed

Lines changed: 453 additions & 24 deletions

File tree

api/models/eplant2.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,11 @@ class AgiAlias(db.Model):
7575

7676
agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
7777
alias: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
78+
79+
80+
class AgiNames(db.Model):
    """ORM mapping for the ``agi_names`` table on the ``eplant2`` bind.

    Each row pairs one ``agi`` value with one ``name`` value; the two columns
    together form the composite primary key, so the same ``agi`` may appear
    with several different names.
    """

    __tablename__ = "agi_names"
    __bind_key__ = "eplant2"

    # Both columns belong to the primary key and are declared NOT NULL.
    agi: db.Mapped[str] = db.mapped_column(db.String(30), primary_key=True, nullable=False)
    name: db.Mapped[str] = db.mapped_column(db.String(255), primary_key=True, nullable=False)

api/resources/gene_information.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from api.models.eplant2 import TAIR10GFF3 as EPlant2TAIR10GFF3
88
from api.models.eplant2 import AgiAlias as EPlant2AgiAlias
99
from api.models.eplant2 import AgiAnnotation as EPlant2AgiAnnotation
10+
from api.models.eplant2 import AgiNames as EPlant2AgiNames
1011
from api.models.eplant_poplar import Isoforms as EPlantPoplarIsoforms
1112
from api.models.eplant_tomato import Isoforms as EPlantTomatoIsoforms
1213
from api.models.eplant_soybean import Isoforms as EPlantSoybeanIsoforms
@@ -447,6 +448,88 @@ def get(self, species="", term=""):
447448
return BARUtils.success_exit(genes_info)
448449

449450

451+
# Escape character handed to SQL LIKE/ILIKE so that user-typed wildcards are
# matched literally instead of being interpreted by the database.
_LIKE_ESCAPE = "/"


def _like_contains_pattern(term):
    """Return a ``%term%`` LIKE pattern with ``%``, ``_`` and ``/`` escaped.

    Without this, a term such as ``%%`` satisfies the minimum-length check but
    matches every row in the table.
    """
    escaped = (
        str(term)
        .replace(_LIKE_ESCAPE, _LIKE_ESCAPE * 2)  # escape the escape char first
        .replace("%", _LIKE_ESCAPE + "%")
        .replace("_", _LIKE_ESCAPE + "_")
    )
    return f"%{escaped}%"


def _collect_unique(pairs, results, seen_agis, limit):
    """Append ``{"agi", "match"}`` dicts for unseen AGIs until ``limit`` is reached.

    :param pairs: iterable of ``(agi, match)`` tuples in priority order
    :param results: list that is extended in place
    :param seen_agis: set of AGIs already present in ``results``; updated in place
    :param limit: maximum total length of ``results``
    """
    for agi, match in pairs:
        if len(results) >= limit:
            break
        if agi in seen_agis:
            continue
        seen_agis.add(agi)
        results.append({"agi": agi, "match": match})


@gene_information.route("/id_autocomplete")
class IdAutocomplete(Resource):
    @gene_information.param("species", _in="query", default="arabidopsis")
    @gene_information.param("term", _in="query", default="AT1G010")
    @gene_information.param("limit", _in="query", default=15)
    def get(self):
        """Return autocomplete suggestions for a gene ID or alias search term.

        Query parameters:
            species: only ``arabidopsis`` is supported.
            term: case-insensitive substring to search for (at least 2 characters).
            limit: maximum number of suggestions, 1-50 (default 15).

        Tables are searched in priority order (alias table, names table, then
        gene IDs from the GFF3 table) and each AGI is reported at most once.
        Returns a success payload holding a list of ``{"agi": ..., "match": ...}``
        objects, or an error payload with HTTP 400 for invalid input.
        """
        species = escape(request.args.get("species", ""))
        term = escape(request.args.get("term", ""))
        limit_raw = request.args.get("limit", "15")

        if not species or not term:
            return BARUtils.error_exit("Missing species or term"), 400

        if len(term) < 2:
            return BARUtils.error_exit("term must be at least 2 characters"), 400

        if not BARUtils.is_integer(limit_raw):
            return BARUtils.error_exit("limit must be a positive integer"), 400

        limit = int(limit_raw)
        if limit < 1 or limit > 50:
            return BARUtils.error_exit("limit must be between 1 and 50"), 400

        if species == "arabidopsis":
            alias_db = EPlant2AgiAlias
            names_db = EPlant2AgiNames
            gff3_db = EPlant2TAIR10GFF3
        else:
            return BARUtils.error_exit("No data for the given species"), 400

        # Treat the user's text literally: '%' and '_' must not act as wildcards.
        pattern = _like_contains_pattern(term)

        results = []
        seen_agis = set()

        # NOTE(review): each query's row LIMIT is applied before Python-side
        # de-duplication, so fewer than `limit` suggestions can be returned
        # when several rows share the same AGI.

        # 1. Search agi_alias by AGI or alias
        alias_query = (
            db.select(alias_db.agi, alias_db.alias)
            .where(
                alias_db.agi.ilike(pattern, escape=_LIKE_ESCAPE)
                | alias_db.alias.ilike(pattern, escape=_LIKE_ESCAPE)
            )
            .limit(limit)
        )
        _collect_unique(
            ((row.agi, row.alias) for row in db.session.execute(alias_query).all()),
            results,
            seen_agis,
            limit,
        )

        # 2. Search agi_names by AGI or name (only if we still have room)
        if len(results) < limit:
            names_query = (
                db.select(names_db.agi, names_db.name)
                .where(
                    names_db.agi.ilike(pattern, escape=_LIKE_ESCAPE)
                    | names_db.name.ilike(pattern, escape=_LIKE_ESCAPE)
                )
                .limit(limit - len(results))
            )
            _collect_unique(
                ((row.agi, row.name) for row in db.session.execute(names_query).all()),
                results,
                seen_agis,
                limit,
            )

        # 3. Fallback: raw gene IDs from tair10_gff3 (only if we still have room)
        if len(results) < limit:
            gff3_query = (
                db.select(gff3_db.geneId)
                .where(
                    (gff3_db.Type == "gene") | (gff3_db.Type == "transposable_element_gene"),
                    gff3_db.geneId.ilike(pattern, escape=_LIKE_ESCAPE),
                )
                .limit(limit - len(results))
            )
            # The gene ID serves as both the identifier and the matched text.
            _collect_unique(
                ((row.geneId, row.geneId) for row in db.session.execute(gff3_query).all()),
                results,
                seen_agis,
                limit,
            )

        return BARUtils.success_exit(results)
531+
532+
450533
@gene_information.route("/gene_isoforms/<string:species>/<string:gene_id>")
451534
class GeneIsoforms(Resource):
452535
@gene_information.param("species", _in="path", default="arabidopsis")

archive/utea/week-01.md

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# UTEA Progress Report
2+
3+
**Name:** Reena Obmina
4+
**Project:** BAR API / ePlant Modernisation
5+
**Supervisor:** Vincent Lau & Nicholas Provart
6+
**Week:** 01 (April 13 - April 17)
7+
8+
---
9+
10+
## 1. Overview
11+
12+
Project kick-off, initial planning, and research. First meeting held to align on scope and establish a proposal and timeline.
13+
14+
---
15+
16+
## 2. Work Completed
17+
18+
- Identified all CGI script locations used across ePlant
19+
- Established proposal structure and 16-week project timeline
20+
21+
---
22+
23+
## 3. Technical Details
24+
25+
Met with Vincent and Dr. Provart remotely on Discord to discuss project scope, timeline, and expectations.
26+
27+
---
28+
29+
## 4. Challenges & Solutions
30+
31+
| Challenge | Solution |
32+
|-----------|----------|
33+
| Many CGI scripts still in use across the codebase | Prioritised by complexity; phased 16-week migration plan |
34+
35+
---
36+
37+
## 5. Results / Outcomes
38+
39+
- Gained understanding of CGI script history and the motivation for removing them
40+
- Produced a full inventory of 9 CGI endpoints to migrate with file locations and complexity ratings
41+
42+
---
43+
44+
## 6. Next Steps
45+
46+
- Convert `/gene_information/gene_query` endpoint from POST to GET
47+
- Implement autocomplete feature
48+
49+
---
50+
51+
## 7. Notes / Observations
52+
53+
A compiled list of the CGI endpoints that need to be migrated.
54+
55+
56+
| # | Endpoint | File(s) | Complexity |
57+
|---|----------|---------|------------|
58+
| 1 | `idautocomplete.cgi` | `Species/arabidopsis/index.ts:13` | Low |
59+
| 2 | `querygene.cgi` | `Species/arabidopsis/index.ts:22`, `InteractionsViewer/scripts/eventHandlers.tsx:58` | Low |
60+
| 3 | `get_rank.php` | `views/eFP/Viewer/GeneDistributionChart.tsx:19` | Low |
61+
| 4 | `chromosomeinfo.cgi` | `views/ChromosomeViewer/ChromosomeView.tsx:145` | Medium |
62+
| 5 | `querygenesbyposition.cgi` | `views/ChromosomeViewer/Viewer/GeneList.tsx:45` | Medium |
63+
| 6 | `groupsuba4.php` | `InteractionsViewer/scripts/loadSublocalizations.tsx:78`, `CellEFP/CellEFPDataObject/index.tsx:43` | Medium (POST) |
64+
| 7 | `get_interactions_dapseq.py` | `views/InteractionsViewer/InteractionsView.tsx:236` | Medium |
65+
| 8 | `plantefp.cgi` | `views/eFP/index.tsx:63` | High (chunked, progress callbacks) |
66+
| 9 | `eplant_navigator_service.cgi` | `views/NavigatorView/NavigatorView.tsx:995` | High (multi-species tree) |
67+
68+
---
69+
70+
### 16-Week Roadmap
71+
72+
**Phase 1 — Foundational Work (Weeks 1–4)**
73+
74+
- **Wk 1–2:** `idautocomplete.cgi` (`Species/arabidopsis/index.ts:13`). Simple GET, returns gene ID suggestions. *(finishing this week)*
75+
- **Wk 3–4:** `querygene.cgi` — two call sites (`index.ts:22` and `eventHandlers.tsx:58`). Wire up REST replacement in both and verify tooltip generation still works in InteractionsViewer.
76+
77+
**Phase 2 — Chromosome Viewer (Weeks 5–8)**
78+
79+
- **Wk 5–6:** `chromosomeinfo.cgi` (`ChromosomeViewer/ChromosomeView.tsx:145`). Returns chromosome structural data; test that the diagram renders correctly.
80+
- **Wk 7–8:** `querygenesbyposition.cgi` (`ChromosomeViewer/Viewer/GeneList.tsx:45`). Depends on chromosome data, so do this after `chromosomeinfo.cgi`. Test that the gene list populates when a chromosome region is clicked.
81+
82+
**Phase 3 — Expression & Ranking (Weeks 9–11)**
83+
84+
- **Wk 9–10:** `get_rank.php` (`eFP/Viewer/GeneDistributionChart.tsx:19`). Single call returning a percentile; verify that the distribution chart still renders correctly afterwards.
85+
- **Wk 11:** Integration testing of Phases 1–3, catch regressions, supervisor check-in.
86+
87+
**Phase 4 — Interactions & Localization (Weeks 12–14)**
88+
89+
- **Wk 12–13:** `groupsuba4.php` — two call sites (`loadSublocalizations.tsx:78` and `CellEFPDataObject/index.tsx:43`). Only POST endpoint in the list; verify subcellular localisation overlays in both the Interactions and Cell EFP views.
90+
- **Wk 14:** `get_interactions_dapseq.py` (`InteractionsViewer/InteractionsView.tsx:236`). DAP-seq interaction graph data; test that the force-directed graph still loads.
91+
92+
**Phase 5 — Heavy lifters (Weeks 15–16)**
93+
94+
- **Wk 15–16:** `plantefp.cgi` (`views/eFP/index.tsx:63`). Chunked fetch with progress callbacks — the most complex call in the codebase. The sample batching logic will need rethinking for the new API.
95+
- `eplant_navigator_service.cgi` (tentative): multi-species, returns a nested tree (`NavigatorView.tsx:995`). May run into Week 17+ or be scoped separately.
96+
97+
---
98+
99+
### CGI (Common Gateway Interface)
100+
101+
What is CGI?
102+
- CGI = Common Gateway Interface
103+
- Early web standard for connecting web servers to programs/scripts
104+
- Flow:
105+
- Server receives request
106+
- Launches a new process
107+
- Passes data via environment variables or stdin
108+
- Script outputs response to stdout
109+
- Commonly used with:
110+
- Perl
111+
- Python
112+
- PHP
113+
114+
Why Is CGI Being Deprecated/Replaced?
115+
- Inefficient:
116+
- New process per request makes it slow and resource-heavy
117+
- Poor scalability:
118+
- Does not handle high traffic well
119+
- Outdated architecture:
120+
- No persistent state or connection handling
121+
- Harder to maintain:
122+
- Requires manual request and response parsing
123+
- Replaced by:
124+
- Persistent application servers
125+
- Structured frameworks with routing, middleware, and APIs
126+
127+
As for CGI in Python...
128+
- `cgi` module:
129+
- Deprecated in Python 3.11
130+
- Removed in Python 3.13
131+
- Part of PEP 594 (removal of outdated standard library modules)
132+
- Also deprecated:
133+
- `CGIHTTPRequestHandler`
134+
- `http.server --cgi`
135+
- Reason:
136+
- Shift toward modern web architectures
137+
138+
---
139+
140+
Perl
141+
- CGI is not removed from the language
142+
- `CGI.pm` module still exists and is maintained
143+
- Still usable but considered legacy
144+
- Modern Perl applications often use:
145+
- FastCGI
146+
- mod_perl
147+
- Web frameworks (e.g., Dancer, Mojolicious)
148+
149+
PHP
150+
- CGI is still supported via CGI SAPI (`php-cgi`)
151+
- Not commonly used in modern deployments
152+
- Standard modern setup:
153+
- PHP-FPM (FastCGI Process Manager)
154+
- Key idea:
155+
- CGI exists but is not the preferred architecture

archive/utea/week-02.md

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# UTEA Progress Report
2+
3+
**Name:** Reena Obmina
4+
**Project:** BAR API / ePlant Modernisation
5+
**Supervisor:** Vincent Lau & Nicholas Provart
6+
**Week:** 02 (April 20 - April 24)
7+
8+
---
9+
10+
## 1. Overview
11+
12+
This week focused on two tasks: converting the `gene_query` endpoint from POST to GET, and implementing a new `id_autocomplete` endpoint to replace the legacy `idautocomplete.cgi` script.
13+
14+
---
15+
16+
## 2. Work Completed
17+
18+
- Implemented:
19+
- `GET /gene_information/id_autocomplete` is a new endpoint replacing `idautocomplete.cgi`
20+
- `AgiNames` SQLAlchemy model in `api/models/eplant2.py` (table was used by the CGI but had no Python representation in the BAR API)
21+
- Fixed:
22+
- SQL injection vulnerability in the CGI's raw string query construction — replaced with SQLAlchemy parameterized queries
23+
- Broken limit math in the CGI (`LIMIT -15` crash) — fixed by tracking the remaining budget across all three queries
24+
- Silent error swallowing in the CGI (`except: print("{}")`) — replaced with proper HTTP 400 responses
25+
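- Slow deduplication in the CGI (a prefix scan over earlier results for every new result, O(n²) overall) — replaced with an O(1) set-membership check per result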
26+
- Converted:
27+
- `gene_query` endpoint from POST to GET
28+
- Added:
29+
- `agi_names` table definition and sample rows to `config/databases/eplant2.sql` for test coverage
30+
- 9 test cases for `id_autocomplete` in `tests/resources/test_gene_information.py`
31+
32+
---
33+
34+
## 3. Technical Details
35+
36+
**Tools / languages used:** Python, Flask-RESTX, SQLAlchemy, SQLite (test), pytest
37+
38+
**`gene_query` POST → GET conversion:**
39+
GET is more appropriate here because the endpoint reads data and changes nothing. POST requests also cannot be bookmarked or called directly from a browser address bar. The conversion involved switching the decorator from `@gene_information.expect(...)` with a JSON body to `@gene_information.param(...)` with query string arguments, and reading values from `request.args` instead of `request.get_json()`.
40+
41+
**`id_autocomplete` design:**
42+
The endpoint queries three tables in priority order, `agi_alias`, `agi_names`, then `tair10_gff3` as a fallback. Then it returns up to `limit` results (default 15, max 50). A single `seen_agis` Python set spans all three queries to prevent duplicate AGIs in the response. The remaining capacity is recalculated before each query so the limit is never exceeded and no negative `LIMIT` values are generated.
43+
44+
**Key design decisions:**
45+
- Minimum term length of 2 characters enforced before any DB query runs, preventing expensive full-table scans
46+
- Output changed from flat strings (`"AT1G01010/ANAC001"`) to structured objects (`{"agi": "AT1G01010", "match": "ANAC001"}`). The `match` field identifies which alias or name triggered the hit, which is more useful for UI highlighting
47+
- `limit` is an optional query parameter (1–50) rather than hardcoded, giving callers control over response size
48+
49+
**Files touched:**
50+
51+
| File | Change |
52+
|---|---|
53+
| `api/resources/gene_information.py` | Added `IdAutocomplete` resource class; converted `gene_query` to GET |
54+
| `api/models/eplant2.py` | Added `AgiNames` model |
55+
| `config/databases/eplant2.sql` | Added `agi_names` table schema and seed data |
56+
| `tests/resources/test_gene_information.py` | Added `test_id_autocomplete` with 9 test cases |
57+
58+
---
59+
60+
## 4. Challenges & Solutions
61+
62+
| Challenge | Solution |
63+
|---|---|
64+
| `agi_names` table used by the CGI but not modelled in the BAR API | Added `AgiNames` SQLAlchemy model; column structure (`agi`, `name`) was inferred from the CGI's SQL and still needs confirmation that the production schema matches |
65+
| CGI's limit math produced `LIMIT -15`, crashing silently | Track `len(results)` across all three queries; pass `limit - len(results)` to each subsequent query |
66+
| Stale SQLite test mirror did not include new `agi_names` table | Deleted cached `eplant2.db` to force rebuild from updated `eplant2.sql` on next test run |
67+
| CGI's deduplication only checked the third query against the first two, using a slow prefix scan | Replaced with a `seen_agis` set populated from all three queries; exact-match lookup, O(1) per result |
68+
69+
---
70+
71+
## 5. Results / Outcomes
72+
73+
- All 8 existing `gene_information` tests continue to pass
74+
- 9 new tests added for `id_autocomplete`, all passing
75+
- SQL injection vulnerability in `idautocomplete.cgi` eliminated
76+
- Broken `LIMIT -15` crash fixed
77+
- Output changed to structured JSON objects, consistent with the rest of the BAR API response format
78+
79+
---
80+
81+
## 6. Next Steps
82+
83+
- Confirm that `agi_names` exists in production `eplant2` DB and that column names (`agi`, `name`) match
84+
- Check whether ePlant frontend expects the old `"AT1G01010/ANAC001"` flat string format or can consume the new `{"agi", "match"}` objects
85+
86+
---
87+
88+
## 7. Notes / Observations

archive/utea/week-03.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# UTEA Progress Report
2+
3+
**Name:** Reena Obmina
4+
**Project:** BAR API / ePlant Modernisation
5+
**Supervisor:** Vincent Lau & Nicholas Provart
6+
**Week:** 03 (April 27 - May 1)
7+
8+
---
9+
10+
## 1. Overview
11+
12+
## 2. Work Completed
13+
14+
## 3. Technical Details
15+
16+
## 4. Challenges & Solutions
17+
18+
## 5. Results / Outcomes
19+
20+
## 6. Next Steps
21+
22+
## 7. Notes / Observations (Optional)

0 commit comments

Comments
 (0)