Skip to content

Commit 0f21b10

Browse files
committed
First commit. Available databases to map with: openIO-Canada, exiobase, IW+2.0
0 parents  commit 0f21b10

File tree

8 files changed

+1314
-0
lines changed

8 files changed

+1314
-0
lines changed

Data/IOIC_sectors.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Data/IW_2.0_flows.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Data/exiobase_sectors.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
## Installation
2+
Install the required module with pip:
3+
~~~
4+
pip install -U sentence-transformers
5+
~~~
6+
7+
Or install the required module with conda:
8+
~~~
9+
conda install -c conda-forge sentence-transformers
10+
~~~
11+
12+
## Getting started
13+
See the `demo.ipynb` notebook for a general demo.
14+
Each of the other notebooks is tailored to mapping against a specific database (openIO-Canada, IMPACT World+ or exiobase).
15+
16+
## Credit
17+
This module builds on the great work of https://github.com/UKPLab/sentence-transformers

demo.ipynb

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 10,
6+
"id": "32c6bfd6",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import json\n",
11+
"import pandas as pd\n",
12+
"\n",
13+
"# sentence_transformers from here: https://github.com/UKPLab/sentence-transformers\n",
14+
"# just pip install it\n",
15+
"from sentence_transformers import (SentenceTransformer, util)"
16+
]
17+
},
18+
{
19+
"cell_type": "code",
20+
"execution_count": 2,
21+
"id": "c85d2644",
22+
"metadata": {},
23+
"outputs": [],
24+
"source": [
25+
"# import and select the machine learning model\n",
26+
"# types of models available here: https://www.sbert.net/docs/pretrained_models.html\n",
27+
"model = SentenceTransformer('all-MiniLM-L6-v2')"
28+
]
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": 7,
33+
"id": "dba50e72",
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"# load the list of sectors/names to be used as reference (path is relative to this notebook, at the repository root)\n",
38+
"with open('Data/IOIC_sectors.json', 'r', encoding='utf-8') as f:\n",
39+
"    IOIC_sectors = json.load(f)\n",
40+
"# encode the reference list into embeddings with the machine learning model\n",
41+
"IOIC_embeddings = model.encode(IOIC_sectors)"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 4,
47+
"id": "efa5a558",
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"# enter a list of the names/products to be matched to the reference\n",
52+
"products = ['ADPE System Configuration','Geophysical Instruments']\n",
53+
"# encode those names/products into embeddings with the machine learning model\n",
54+
"products_embeddings = model.encode(products)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 8,
60+
"id": "60fc32c4",
61+
"metadata": {},
62+
"outputs": [],
63+
"source": [
64+
"# calculate the similarity between each name/product to-be-matched and the reference list\n",
65+
"scores = util.pytorch_cos_sim(products_embeddings, IOIC_embeddings)\n",
66+
"# sort the scores in descending order and keep the matching indices into the reference list\n",
67+
"sorted_scores, indices = scores.sort(dim=1, descending=True)"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 15,
73+
"id": "e3f4ea35",
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"# store the results in a tidy dataframe\n",
78+
"\n",
79+
"# the number of similarities per name/product to-be-matched that will be provided\n",
80+
"# i.e. how many candidate reference sectors are returned per product, capped at the reference size\n",
81+
"number_of_matches = 5\n",
82+
"top_k = min(number_of_matches, len(IOIC_sectors))\n",
83+
"\n",
84+
"# build every row first and create the dataframe once (no quadratic pd.concat in a loop)\n",
85+
"records = [\n",
86+
"    {'product': product,\n",
87+
"     'order': j + 1,\n",
88+
"     'sector': IOIC_sectors[indices[i][j].item()],\n",
89+
"     'similarity': sorted_scores[i][j].item()}\n",
90+
"    for i, product in enumerate(products)\n",
91+
"    for j in range(top_k)\n",
92+
"]\n",
93+
"df_results = pd.DataFrame(records, columns=['product', 'order', 'sector', 'similarity'])\n",
94+
"df_results = df_results.set_index(['product', 'order'])"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 16,
100+
"id": "a596ead3",
101+
"metadata": {},
102+
"outputs": [
103+
{
104+
"data": {
105+
"text/html": [
106+
"<div>\n",
107+
"<style scoped>\n",
108+
" .dataframe tbody tr th:only-of-type {\n",
109+
" vertical-align: middle;\n",
110+
" }\n",
111+
"\n",
112+
" .dataframe tbody tr th {\n",
113+
" vertical-align: top;\n",
114+
" }\n",
115+
"\n",
116+
" .dataframe thead th {\n",
117+
" text-align: right;\n",
118+
" }\n",
119+
"</style>\n",
120+
"<table border=\"1\" class=\"dataframe\">\n",
121+
" <thead>\n",
122+
" <tr style=\"text-align: right;\">\n",
123+
" <th></th>\n",
124+
" <th></th>\n",
125+
" <th>sector</th>\n",
126+
" <th>similarity</th>\n",
127+
" </tr>\n",
128+
" <tr>\n",
129+
" <th>product</th>\n",
130+
" <th>order</th>\n",
131+
" <th></th>\n",
132+
" <th></th>\n",
133+
" </tr>\n",
134+
" </thead>\n",
135+
" <tbody>\n",
136+
" <tr>\n",
137+
" <th rowspan=\"5\" valign=\"top\">ADPE System Configuration</th>\n",
138+
" <th>1</th>\n",
139+
" <td>Office administrative services</td>\n",
140+
" <td>0.254899</td>\n",
141+
" </tr>\n",
142+
" <tr>\n",
143+
" <th>2</th>\n",
144+
" <td>Computer systems design and related services (...</td>\n",
145+
" <td>0.237132</td>\n",
146+
" </tr>\n",
147+
" <tr>\n",
148+
" <th>3</th>\n",
149+
" <td>Custom software design and development services</td>\n",
150+
" <td>0.226263</td>\n",
151+
" </tr>\n",
152+
" <tr>\n",
153+
" <th>4</th>\n",
154+
" <td>Advertising, public relations, and related ser...</td>\n",
155+
" <td>0.223295</td>\n",
156+
" </tr>\n",
157+
" <tr>\n",
158+
" <th>5</th>\n",
159+
" <td>Facilities and other support services</td>\n",
160+
" <td>0.201962</td>\n",
161+
" </tr>\n",
162+
" <tr>\n",
163+
" <th rowspan=\"5\" valign=\"top\">Geophysical Instruments</th>\n",
164+
" <th>1</th>\n",
165+
" <td>Measuring, control and scientific instruments</td>\n",
166+
" <td>0.544412</td>\n",
167+
" </tr>\n",
168+
" <tr>\n",
169+
" <th>2</th>\n",
170+
" <td>Navigational and guidance instruments</td>\n",
171+
" <td>0.40421</td>\n",
172+
" </tr>\n",
173+
" <tr>\n",
174+
" <th>3</th>\n",
175+
" <td>Other civil engineering works</td>\n",
176+
" <td>0.386468</td>\n",
177+
" </tr>\n",
178+
" <tr>\n",
179+
" <th>4</th>\n",
180+
" <td>Other professional, scientific and technical s...</td>\n",
181+
" <td>0.353572</td>\n",
182+
" </tr>\n",
183+
" <tr>\n",
184+
" <th>5</th>\n",
185+
" <td>Other electrical equipment and components</td>\n",
186+
" <td>0.315113</td>\n",
187+
" </tr>\n",
188+
" </tbody>\n",
189+
"</table>\n",
190+
"</div>"
191+
],
192+
"text/plain": [
193+
" sector \\\n",
194+
"product order \n",
195+
"ADPE System Configuration 1 Office administrative services \n",
196+
" 2 Computer systems design and related services (... \n",
197+
" 3 Custom software design and development services \n",
198+
" 4 Advertising, public relations, and related ser... \n",
199+
" 5 Facilities and other support services \n",
200+
"Geophysical Instruments 1 Measuring, control and scientific instruments \n",
201+
" 2 Navigational and guidance instruments \n",
202+
" 3 Other civil engineering works \n",
203+
" 4 Other professional, scientific and technical s... \n",
204+
" 5 Other electrical equipment and components \n",
205+
"\n",
206+
" similarity \n",
207+
"product order \n",
208+
"ADPE System Configuration 1 0.254899 \n",
209+
" 2 0.237132 \n",
210+
" 3 0.226263 \n",
211+
" 4 0.223295 \n",
212+
" 5 0.201962 \n",
213+
"Geophysical Instruments 1 0.544412 \n",
214+
" 2 0.40421 \n",
215+
" 3 0.386468 \n",
216+
" 4 0.353572 \n",
217+
" 5 0.315113 "
218+
]
219+
},
220+
"execution_count": 16,
221+
"metadata": {},
222+
"output_type": "execute_result"
223+
}
224+
],
225+
"source": [
226+
"df_results"
227+
]
228+
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": null,
232+
"id": "485c61a8",
233+
"metadata": {},
234+
"outputs": [],
235+
"source": []
236+
}
237+
],
238+
"metadata": {
239+
"kernelspec": {
240+
"display_name": "Python 3 (ipykernel)",
241+
"language": "python",
242+
"name": "python3"
243+
},
244+
"language_info": {
245+
"codemirror_mode": {
246+
"name": "ipython",
247+
"version": 3
248+
},
249+
"file_extension": ".py",
250+
"mimetype": "text/x-python",
251+
"name": "python",
252+
"nbconvert_exporter": "python",
253+
"pygments_lexer": "ipython3",
254+
"version": "3.9.7"
255+
}
256+
},
257+
"nbformat": 4,
258+
"nbformat_minor": 5
259+
}

0 commit comments

Comments
 (0)