{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET\n", "import string\n", "import os \n", "import re\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the merged OpenVirus corpus\n", "train_data_1 = pd.read_csv(\"OpenVirus-merged.csv\")\n", "\n", "# Remove the first row because it appears as data but is actually only the title row.\n", "# NOTE(review): read_csv already uses the first line of the file as the header, so confirm this row really is a repeated title and not a genuine sample.\n", "train_data = train_data_1.iloc[1:]\n", "\n", "# Shuffle the dataset; the fixed random_state makes the row order (and the category_id codes that factorize() derives from it) reproducible across runs.\n", "train_data = train_data.sample(frac=1, random_state=0).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ArticleIdTextCategory
0PMC6200010french polynesia was the most probable source ...viral_epidemic
1PMC6142641one of the viral diseases of zoonotic concern ...viral_epidemic
2PMC6538675avian influenza ai is a highly contagious vi...viral_epidemic
3PMC7305475with the recent development of multiple tools ...viral_epidemic
4PMC6538675undoubtedly the rapid and continuous evolutio...viral_epidemic
\n", "
" ], "text/plain": [ " ArticleId Text \\\n", "0 PMC6200010 french polynesia was the most probable source ... \n", "1 PMC6142641 one of the viral diseases of zoonotic concern ... \n", "2 PMC6538675 avian influenza ai is a highly contagious vi... \n", "3 PMC7305475 with the recent development of multiple tools ... \n", "4 PMC6538675 undoubtedly the rapid and continuous evolutio... \n", "\n", " Category \n", "0 viral_epidemic \n", "1 viral_epidemic \n", "2 viral_epidemic \n", "3 viral_epidemic \n", "4 viral_epidemic " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.head() #visualizing the data table" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "train_data['category_id'] = train_data['Category'].factorize()[0] # factorizing the output into zero and one" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.groupby('Category').category_id.count().plot.bar(ylim=0)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer # term frequency & inverse document frequency (finding important words within text)\n", "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=7, norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words='english')\n", "features = tfidf.fit_transform(train_data.Text).toarray()\n", "labels = train_data.category_id" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# Display name of each class, keyed to the integer code stored in train_data.category_id.\n", "# NOTE(review): factorize() numbers the categories in order of first appearance, so confirm that code 0 really is the 'true' class.\n", "category_to_id = {'true': 0, 'false': 1}\n", "# Derive the reverse lookup from category_to_id so the two tables cannot drift apart (the hand-written copy misspelled 'false').\n", "id_to_category = {category_id: category for category, category_id in category_to_id.items()}" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# 'false':\n", " . Most correlated unigrams:\n", " . media\n", " . 
swine\n", " . contagion\n", " . Most correlated bigrams:\n", " . zika virus\n", " . covid 19\n", " . sars cov\n", "# 'true':\n", " . Most correlated unigrams:\n", " . media\n", " . swine\n", " . contagion\n", " . Most correlated bigrams:\n", " . zika virus\n", " . covid 19\n", " . sars cov\n" ] } ], "source": [ "# Use chi-square analysis to find the correlation between features (importance of words) and labels (article category)\n", "from sklearn.feature_selection import chi2\n", "import numpy as np\n", "\n", "N = 3 # Number of top-scoring terms to print for each category\n", "\n", "# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists and fall back on older releases\n", "if hasattr(tfidf, 'get_feature_names_out'):\n", " all_feature_names = np.array(tfidf.get_feature_names_out())\n", "else:\n", " all_feature_names = np.array(tfidf.get_feature_names())\n", "\n", "# For each category, find the words that are most highly correlated with it.\n", "# With only two categories the one-vs-rest chi2 scores are the same for both, so both blocks print the same terms.\n", "for Category, category_id in sorted(category_to_id.items()):\n", " features_chi2 = chi2(features, labels == category_id) # chi2 test of every feature against membership of this category\n", " indices = np.argsort(features_chi2[0]) # Indices that sort features_chi2[0] (the chi-squared statistic of each feature) in increasing order\n", " feature_names = all_feature_names[indices] # Feature names in increasing order of chi-squared statistic\n", " unigrams = [v for v in feature_names if len(v.split(' ')) == 1] # Single-word features (same increasing order)\n", " bigrams = [v for v in feature_names if len(v.split(' ')) == 2] # Two-word features (same increasing order)\n", " print(\"# '{}':\".format(Category))\n", " print(\" . Most correlated unigrams:\\n . {}\".format('\\n . '.join(unigrams[-N:]))) # Print 3 unigrams with highest Chi squared stat\n", " print(\" . Most correlated bigrams:\\n . {}\".format('\\n . 
'.join(bigrams[-N:]))) # Print 3 bigrams with highest Chi squared stat" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "##dimension reduction \n", "from sklearn.manifold import TSNE #low dimensionality plot\n", "\n", "# Sampling a subset of our dataset because t-SNE is computationally expensive\n", "SAMPLE_SIZE = int(len(features) * 1)\n", "np.random.seed(0)\n", "indices = np.random.choice(range(len(features)), size=SAMPLE_SIZE, replace=False) # Randomly select 30 % of samples\n", "projected_features = TSNE(n_components=2, random_state=0).fit_transform(features[indices]) # Array of all projected features of 30% of Randomly chosen samples " ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAECCAYAAAD3vwBsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO2dfZRV1ZXgf1soQaIIAn5REEoaCQh+hIIQ6aSjMYgfUzix7WBMh0ycYcWPJLbp5YCOJjMT1jLRNsZW4mLsoI5fMWMSia2xopHY034gJmVEgQCKsZQEJGJwiARhzx/3FjyKV1Xv3XvOvee+u39r1ar7zv04+95339nn7L3PPqKqGIZhGOXjgLwFMAzDMPLBFIBhGEZJMQVgGIZRUkwBGIZhlBRTAIZhGCXFFIBhGEZJMQVgICLz4//nichn4u3xInJM5TEicqGIzIo/t4jI/xCRgT1c8wsicmQP+ySNnL1dT0SuFpET0tTZUz3dr1fx3M7otu8TIjK91vq6yyciw0RkahL5apE9Pu6w+DtdLCIHdr9GXzLUcP0z+j7KyJv+eQtgBMExccN/NnBnXHYU8J6IfA54m+hdmQXcEO8/HdgNnCMio4GtwHPAKcA7QDPwGRE5WFUXxg3TM8AZwD0ici5wgKr+N4gaHVW9VkSuBF4Buq75GjAJeBo4XkTOAsYCTcAq4FNAB3CHiDQDxwNPishCQID/BVwN3AR0iMingKnA+8AD8T33Bx4H/gOwHOgnIv8ADFXVa2L5zgY+CvwE+GxcZ9dzaxGRJ+N61gCvdt0TUSfrGVX9RdfDFpEvAMOBg4HbgWuA20Xk9AqZx4pIK3BIfL0mYBjwJDBFRP4mlueA+LmeAOwAhsZ1HAB8M65yEfBt4AXgeVV9TFX/CFwbyzgA+Etc3xnANOC+WIbPABuBFuAvwG1AW0W9ZwMvAx8A/gQcCLQDJ4hIP+DY+Fn/CTg8/vtJXMezqvrvGLlhIwAD4BVV/QHwErBKRC4jahQgapDuJPoRP6+qT8flq4GfAX8NbCJqzA4havwnEjXcPwB2xcf3i/8/RKQ8OgEVk
a56XhGRc4gaqcprnqyq/6SqTwG/UdV/JWqY/wn4CPBHVb0DQFU7geeBI4H/HddxOPCEqnbE9XwSeBMYCAwiatSOBWYCN6jqo4AC3wX+XPGMDgFeJ2p0u+rsem4QKaknVXVJxTmtcV1HVhkp/ZRIwR0KPEGkcCtlJr6/TUSN+nhV/WdV7WrEf1l5fWCwqi4iUsoAI4DfAXcBM4i+228DU7oEEJGPAa+p6ra46FTgOmBlhZxb42e9CfhnYHK3ereo6m3AQXEdQ9n7XU9R1RuIFBfA3cBbwHqitmcQRq6YAjAAOkXkixA1oqp6I1Fvkvj/ub2c+2/AYcBvgWOA94iUxwbgi8CLIvJVYEJ8/E4ixdEMvK6qXfU8BFxF1HusvOZTInK5iJwMbBORTwNvi8jXgGfj63XnKeDv4zo2dTvmcaKGay3wV8D/i+VtBy6Pe+G7VXU3kSLoYjSRMjyg4np7nhvwIvBxEflPFec8BwwmGqlc1k3GvwWOJlKYO6vIrESjniFEynaNiFwqIpOJLEandbv+H0XkfPaO6jcBHwQ+B/w7sEujaf9CdIEjiEYIR4jIofE5TwD/mb3fFUSKHyJF2XV+Zb27Ko49DNhONFoAWCEilxM1+lQcOxzYRjRymoORG2KpIAzDPyIyXFXfire/APxMVX/fw7HnA+tU9bkMRTRKiCkAwzCMkmImIMMwjJJiCsAwDKOkmAIwDMMoKYWZBzB8+HAdM2ZM3mIYhmEUiueff/4tVR1RbV9hFMCYMWNYsWJF3mIYhmEUChF5rad9ZgIyDMMoKaYADMMwSoopAMMwjJJiCsAwDKOkmAIwDMMoKc4UgIj0E5Ffi8hD8efDROTnIrI2/j+04tgFIrJORNbEybcMwzCMjHE5AvgqUXbALuYDj6vqOKIMjF2LZ0wE5gDHEeWXXxTnDTeMoFm6ZimXPnwpS9cszVsUw3CCEwUQL8RxFtFiEV3MBu6It+8Azqkov09Vd6jqq8A6osUhDCNYlq5ZyvkPnM8tz93C+Q+cb0rAaAhcjQBuBK5g72IUAEeo6kaA+H/XIhcjiRbW6KIzLtsPEZknIitEZMXmzZsdiWoY9dO+vp3tO7cDsH3ndtrXt+cskWGkJ7UCiJfK26Sqz9d6SpWyqjmpVXWxqraqauuIEVVnMhtGJswcO5NBTdECVoOaBjFz7MycJcofM4kVHxepIGYAbSJyJtEye4NF5C7gDyJylKpuFJGjiFYogqjHP6ri/Gai5eUMI1jaxrdx77n30r6+nZljZ9I2vi1vkXKlyyS2fed2lnQs4d5z7y39MykiqUcAqrpAVZtVdQyRc/cXqvo5YCkwNz5sLvBgvL0UmCMiA0SkBRhHtBC3YQRN2/g2bj7zZu8NXRF61rmYxN7aCmtfi/4bTvA5D+Ba4FMishb4VPwZVX0JuB94mWht2EtUdVePVzEMitEouqAozubMTWJvbYVVr8Cbm6P/pgSc4DQbqKouA5bF21uAT/Zw3EJgocu6jcalTOaGaj3rEO81c5PY2+/A7jjGZPfu6PPwIX7rLAE2E9gInjJF4BTJ2ZyVSQyAoYfCAXFzdcAB0WcjNYVZD8AoIG9tjXpqQw9N1VubOXYmSzqWsH3n9uAbxbQ46Vk7eu5BMXwITDim8e4rZ0S1agRmcLS2tqotCFMgumy2u3dHPbYJx6T60S5ds9QicGrB8XM3io+IPK+qrdX22QigQcm9wXRss20b3+bnPhqtt2y2cqMOzAfQgAQRSVIEm20jRpYU4bkbwWAKoAEJwmnaZbM9ekS4ZohqveWiU4TnbgSDKYAGJJhIkuFDYNwHw22EGrW3HPpzN4LBfAANiKUtqJEsI0uK7GsIXfbQ5QsYiwIyDN8UOTIndNlDly8AeosCMhNQoJQl9UEpKLKvIXTZQ5cvcEwBBEgQUTxZ08iJvorsa8hS9iTvQJGfbQCYAgiQIKJ4sqQRwzErKXJkTlayJ30HivxsA8AUQIAEE8WTFWUYxvuIz
Mlq1JRFVFGad8CinhJjUUABUroonqGHwu+37HXkuRzGu44QCSXipNL5+fstxe/91voOJH3+GX9vuc/ErxGLAjJqx+ePyMe1XUeIhBRxsva1yFzSxdEjol5wkenrHUj6/DP+3irTlw9qGpR7+nKLAjLS49tO72MY79q05NtUVY9JpxGdn329A0mff8YmxiL58EwBGLVRRDu960bSZ6Nbr4Ito/Mz6fPPWFkWyYdnPgCjNnza6X3heqavz5nDSbJ4Dh+SS8Ofm3076fPPeC2BIvnwzAdg1E6jOlRDwIOd2kdDHZp9ew/2LvWI+QAMN7i00zd67H+9ODbp+JpMGKR9296lxJgCMPKhFp9CjrODc0nF4VDB+mqog7RvF9E/FQimAApOYXMG9eWYy7FX1wipOHw11F327UumXhKO+acRI6IyIrUTWERGAXcCRwK7gcWq+l0ROQz4ATAG2AD8naq+HZ+zALgQ2AV8RVUfTStHGam0xy7pWBLODxL6tsn25ZjLcWnDar3nYJ5rjfh0RHpbnjMptmB8YlyMAN4HvqaqE4DpwCUiMhGYDzyuquOAx+PPxPvmAMcBs4BFItLPgRylI0h7LNTee+/N5JFjry5zM4cnU1fb+DZuPvPmsBprX1g6iESkVgCqulFVfxVvbwNWASOB2cAd8WF3AOfE27OB+1R1h6q+CqwDpqWVo4wEaY8FNzbZHOPcnZo5+mrczYFp5IjTeQAiMgY4CXgWOEJVN0KkJETk8PiwkcAzFad1xmXVrjcPmAcwevRol6Jmi6cQtWDjjV3NGcgpzh0cmTlqyddTr6nLc7hjUXLYGG5wpgBE5GDgAeAyVf2TiPR4aJWyqpMRVHUxsBiieQAu5Mwcz0m7grPHQv422VBiwmtp3OtRlp7fJW8+pby/j7zrDxgnUUAi0kTU+N+tqj+Ki/8gIkfF+48CNsXlncCoitObgTddyBEkBQ9RSxxllJdNNiSTSi1+jHpMXZ7fJS8+pby/j7zrD5zUCkCirv6/AKtU9YaKXUuBufH2XODBivI5IjJARFqAccDytHIES4FD1AoZDpmFwq3VaVtr416rsvT8LnnxKeXdAcq7/moEtPqdCxPQDODvgRdFpCMuuxK4FrhfRC4EfgecB6CqL4nI/cDLRBFEl6jqLgdyhEne5pAUZBoO6WqY7jtnUb1mmK59XQ1Pmnvz/C558SnlnUMq7/q7E9g6DpYLyOiRzPK++Mjb70vh1puHP00O+wJ2GqqS973kXX8lOazj0FsuIMsGWnQ8vtyZRRm5nvRVa/RQkmdXb48yyb356CXm2QjmGM0VRP2VBDYiMQVQZDIYTmYSZZTHjyLps6vXDJPk3lwrRN/vSUg97BDlqSQwk7ApgCKTY7oEp+Txo0jz7HrrUXZvfJLcm2uF6PM9Ccym7UQe3wokoBGJJYMrMgWOMNqPrMNGfTy7nkIO670317Ogfb4noUXZpJWnZGGjNgIoMq57zi57PiEPw8HPqMNlT9tlL9HnCCswm3ZqeRplVF0jpgCKjquGwuVQPoMZq4kc09XMM43cGFbiy+wQmE07tTwhf4ceMAWQgobKm+Ky5+OxF5U4XUG9SinJCCZN4xP6iKk3+lIuWd9bGmUXmkLzjPkAElLIWbK94dJO7NHmnDhdQT224TR24CS+DEd256RpO7wuKlREm3qt32FAM3qTYgogIcHm4k+KS8ejx1TOidMV1KOUsnZsOqgvaYfEe0cmNCexK4qo2KpgCiAhwebiT4PLSBxPUT2Jc/XXo5Syjq5yUF/SDon3jkwWzzKPnniDKDbzASQk2Fz8lRTFrlynnIknp3W3DfdUb9Z2YAf1zRw7kyUdS/ak7ai1Q5L0vJrx/SzzmofQIM5iywXUqLjOr+OLvOR0XG8IAQFJZQhB9sTkkFtnDwXpYPWWC8gUQEi4fKHy/GHUQ15yOqzXS9K8gjQudeHjnorS0cmR3hSA+QBCwbVTyaXt1aeNNa/ZzA7rdW5Hd/guO
IvwSfsO+HKaegw4KAOmAELBtVPJ1Q/Dd7RDPXK6VEQOGw7nAQGO3gVnET4u3gGfTlNXAQcNENZZL6YAQsFHT9jFDyOLaIda5PShiBw1HIkjk3rC0bvgbGTi4h0IPW9Vg4R11otFAYVCqDMQXUU7pLX/Zp2jxUNkUs3OVkfvgrMIHxfvQKjvdxclywHUhTmBjb5J23i7cNSluEbdUS4eHIuJHcUpn72zCJ9QnNK+5GhgZ7KtCFZJKC+yS0LPX+6id5WwB5kod5CH3mCi9ZUdxLg7W9AnhBz2PmP+Qx+heKJcPoBGtPMV4Z5c2X8T2OwT2cHrkbdGx2EiR3FS27uLiJ0QnaG+/VGeZq+nwvN3US4F0CDTt/ehCPeUY6heooa3VnnrUL6JHMVJFGfaDkHIHYrQHcmuyeC7KJcJqEGmb+9DUe4pJxNC4pQdtchbp6mobnNMErNEWvNVyM72AM00XmdRZ/Bd5OYEFpFZwHeBfsBtqnptb8c7cwKbD6A4siS4VqIfZFKZkzgOfX9XaZ2ZWTpDC+549TIDvBJHzye4mcAi0g+4BTgDmAicLyITM6k8RDtfWlLck9Nc8C6HrAmulWjiU9rc/5WmIujdXpu0rnrswGnNbVma63IyX7p6571nUs3gu8jLBzANWKeqr6jqX4D7gNk5yeKWUB1oVXCeC97lDzrBtRL9IOuop2rD0aV8oe/GPcnzSaI00nZy6j0/6Tufg03f5TufSUp4zx3WvBTASOD1is+dcdk+iMg8EVkhIis2b97cfXd4ZOC0cdljd96DyXlVsUQ/yBrr6bPhqKVxT/J88ooEqhWXI6gMRuUu33nnM8BzIC8FIFXK9nNGqOpiVW1V1dYRI0ZkIFZKPA9pXffYnfdgXP6gE1wr0Q+yxnr6bDh6atwrG+IkzyePSKB6SPvOZ2ySdf3Ot41v4+Yzby5k4w/5RQF1AqMqPjcDb+Ykizs8R+QkmkzUC14WtXEZ7ZPgWokmPtVQT59pFapFqPQ0camPuvZzZGcYCVS3Ez2vKLSEzvRCLOSUIblEAYlIf+C3wCeBN4DngM+q6ks9nVOYVBAeozy8Rx1kgI+wudTXrPE7q1pPb+cmWHPAyXecMHokr3QVdVPw6KGsCS4VhKq+LyKXAo8ShYF+v7fGv1B4jHfPuvfiurFOlJbB9zXrSC+w3+iir3MT9I6djPISxssnrjvrOR4lTdzmg9xmAqvqw6p6rKqOVdWFeclRNLKyOTqPEMJP2Fzqa6axYfd1bgKbf1ob9Z4ggS1P1m1bzySqBQeBDGWbEeyRcqWCMGrGR2Pto4FJfc00jUkt58ZOzqVbnqyp0UsTWZJWaWcR1eKkY5F19FBGEVVO5+TUiKWDLjA+p6H78jcU2QeQ9NysbOuXPnwptzx3y57Pl0y9hJvPvLnmW6mXJM89axlTk5G/wad/L7iZwEZ6fJhoKvHVG6zFhFVvTyi1WaxyMle9Pb0awhgTjaYShHJmZcKB5O9fljI6IaPZyt5nFfeAKQCfeBw6ZvHC1NOwuhq+plFsqWRIGDtfS51ZpYJOpLQTvqNJ37/CTZ7KyN+Ql2I0E5AvPA8dQwoJdSlLUhNBahk8h2yGsCqZyzpCev96omirofky6QYXBloKPIeqhTShxeUEtaTr2KaWwXPIZiapoOslxTvq6/1z1Qg6DTnOKMzV2eptddD4JqC8krNlMHTsyUSTdTSBy+FrUhNBahmyDNms9Z30nSYh5TvqOiTZpV8rL5t60WhsE1DeMwZzyNOf19Dc68IYLmVw/J3kYtrJec0FX7iMECqCiSorejMBNbYCSGDXLTpFCrPLXGnk3SGA9O9kCPfgCdeNdgidkhAobxhoCWcMpjGFZGk68h3GWhWXIX155cBPeA9evlvH5lXXEUJFz9SZBY09AoCghrhZkaTnk/WQOZeRiqves4tlF9NMPKuzbi/fbQOPRBqN8o4AoDGXgOyDJD2frJ1mucQ9u0ohkGcO/AT34OW7zWk5x
yKRR2qHeml8BWDURNYNsovhfqIfmIsOQd6mxTrvwct3m/cz6IFQGt1cTJwJaHwTUIMSZE4dT3L1VI8zs0YSk4xv02Le0Uq1kFRGT88upMifkIIxbCJYg+Ejrz6kn4jiS65qOJt8Vsd6APuQZGUv3zL1QupJRtUa7SQTpDzcWxeuV8xLQ9IJjVljJiCXZDTpzKe9Ps0QOks/gjOzhidbdioTQGj2dZdrDHu8t5ASzRUl55EpAFdkuBC3rxc9rd0yyx+gsx+YJ1t2KmWYdCF4X50Pl422R99BaI1uEcJQzQfgiownnfmw6bqwWyYNQc11wo4Hm3Rqe3Q9MvkOyXR9/SKGZhdR5pjyzgTOkgaIi87DiRaS4841mSm2LDof9TaABW4w96Pgv21zAmdBFtkbPVNLhkfXjVpIjruaqbFxyyy7Y4JMpj3S073V4/D16OjNhQZehN58AC4JbNJZEodub3ZLH7HNrv0G3uPAM/T11IyrCW6u7i00J3ZaAp3z4AJTAA2Kj8baR5SPS8ddJpNvQm3cXHQ+XN2b5wYz88leWS9CnyGpFICIXCciq0XkNyLyYxEZUrFvgYisE5E1InJ6RfkUEXkx3neTiEgaGYzq+GisfUX5uIqWyCQMtYF7g87uzWODmdsM28BG965IOwL4OTBJVY8HfgssABCRicAc4DhgFrBIRPrF53wPmAeMi/9mpZTBqIKPxrqyt37Z9MtoX98e1BT3TMJQ8+gNZrWoUU/3lqR+Tw2mLfTillROYFWtfPrPAH8bb88G7lPVHcCrIrIOmCYiG4DBqvo0gIjcCZwDPJJGDmN/fC3Z13WdrGb81kOae67LuZ3REoGAe4dqXw7s7vcWmEO3KDNsi4LLKKAvAj+It0cSKYQuOuOynfF29/KqiMg8otECo0ePdihqOfAVheI7cidNpFGSe3aZwsJ56KfLCJQkjXlgETAhrYXdCPRpAhKRx0RkZZW/2RXHXAW8D9zdVVTlUtpLeVVUdbGqtqpq64gRI/oSNVzyWpfYEz5NLXnYeF2ZFbzI7tLnkMTJG6DPowgzbItCnwpAVU9T1UlV/h4EEJG5wNnABbp3VlknMKriMs3Am3F5c5XyxiWAsEHXURM+p9znYeN1pdC8yO7S55CkMW+0CJgG64ylJW0U0CzgvwJtqrq9YtdSYI6IDBCRFiJn73JV3QhsE5HpcfTP54EH08gQPDmHDfrqUfvqheWR0MuVQvMmuyuHatLG3GMETKYhnQF0xkIjVSqI2Lk7ANgSFz2jql+K911F5Bd4H7hMVR+Jy1uB24GDiJy/X9YahAg+FURP5DyNPKS85LWSe26gFBRZ9qzJPA1Ixvm69pBzWgxvqSBU9a962bcQWFilfAUwKU29hSLnFBFFjJrILIWCB4ose9ZkngbEZcqMWgksiqo7NhM4C3KcRBJailwXhLLsnwsa6V7qJXNzXx7+jFBnjsdYNlCjUDRS9tCi3ItPs1bDm8wCyCTamwnIRgBGejKMrMgkSqgBVnZzhe+w3IYP6Qw8isoUgJGOjCMrvJsNiriym0eFVQQlFTwB5xGy9QCMdGQ8U9T7TNAM78fJvXh2MhYxiMCoHfMBGOkIwMbplKLdTwahjQ1vp29wbElIwy+NtPwfFOt+iqawUmCKKBmmAAwja7JUIkVSWAkpSsRUiFgUUFmxvCf5kHXKgYCdjK4wZ7QfTAE0KkXOe1J0xRX45J8ikkeOqDJgUUCNSmB53Gsm8KnzNZFHyoEsycHkZOsA+MEUQKNS1EYoC8XluwHLOf+TV3JU0JZnyT2mABqVojZCvhVXVg1YlstGZklRR5ZGVUwBNDJFbIR8K66iN2B5R/wUdWRpVMUUgBEePhVXkRuwEPwjRR1ZGlUxBWCUiyI3YKGMXoo4sjSqYgrAKB9FbcCKPHoxgsQUgGEUhSKPXowgMQVg+CVvp2UWZHmPRR29GEFiM4ENfxR5NnKtlOEejYbFFIDhjzKkRCjDPULx03MYVXGiA
ETkH0VERWR4RdkCEVknImtE5PSK8iki8mK87yYRERcyGAEy9NDIWQmN67Qswz3aKKdhSa0ARGQU8CngdxVlE4E5wHHALGCRiPSLd38PmAeMi/9mpZXBCJTA10N1QhnusSyjnBLiYgTwHeAKoHJhgdnAfaq6Q1VfBdYB00TkKGCwqj6t0UIEdwLnOJDBCJUSpCpu+HsswyinpKSKAhKRNuANVX2hmyVnJPBMxefOuGxnvN293DCMULHw04alTwUgIo8BR1bZdRVwJVAtMXc1u772Ut5T3fOIzEWMHj26L1ENw/CFhZ82JH0qAFU9rVq5iEwGWoCu3n8z8CsRmUbUsx9VcXgz8GZc3lylvKe6FwOLIVoSsi9ZDcMwjNpJ7ANQ1RdV9XBVHaOqY4ga9w+r6u+BpcAcERkgIi1Ezt7lqroR2CYi0+Pon88DD6a/DcMwDKNevMwEVtWXROR+4GXgfeASVd0V774IuB04CHgk/jMMwzAyxpkCiEcBlZ8XAgurHLcCmOSqXsMwDCMZNhPYMAyjpJgCMAzDKCmmAAzDMEqKKQDDMIySYgrAMAyjpJgCMAzDKCmmAAzDMEqKKQDDMIySYgrAMAyjpJgCMAzDKCmmAAzDMEqKKQDDMIySYgrAMAyjpJgCMAzDKCmmAAzDMEqKKQDDMIySYgrAMAyjpJgCMAzDKCmmAAzDMEqKKQDDMIySYgrAMAyjpJgCMAzDKCmpFYCIfFlE1ojISyLy7YryBSKyLt53ekX5FBF5Md53k4hIWhkMwzCM+umf5mQROQWYDRyvqjtE5PC4fCIwBzgOOBp4TESOVdVdwPeAecAzwMPALOCRNHIYhmEY9ZN2BHARcK2q7gBQ1U1x+WzgPlXdoaqvAuuAaSJyFDBYVZ9WVQXuBM5JKYNhGIaRgLQK4FjgYyLyrIj8UkSmxuUjgdcrjuuMy0bG293LDcMwjIzp0wQkIo8BR1bZdVV8/lBgOjAVuF9EjgGq2fW1l/Ke6p5HZC5i9OjRfYlqGIZh1EGfCkBVT+tpn4hcBPwoNucsF5HdwHCinv2oikObgTfj8uYq5T3VvRhYDNDa2tqjojAMwzDqJ60J6CfAqQAicixwIPAWsBSYIyIDRKQFGAcsV9WNwDYRmR5H/3weeDClDIZhGEYCUkUBAd8Hvi8iK4G/AHPj0cBLInI/8DLwPnBJHAEEkeP4duAgougfiwAyDMPIAYna6/BpbW3VFStW5C2GYRhGoRCR51W1tdo+mwlsGIZRUkwBGIZhlBRTAIZhGCXFFIBhGEZJMQVgGIZRUkwBGIZhlBRTAIZhGCXFFIBhGEZJMQVgGIZRUkwBGIZhlJS0uYByZefOnXR2dvLee+/lLYp3Bg4cSHNzM01NTXmLYhhGg1BoBdDZ2ckhhxzCmDFjaOSlhVWVLVu20NnZSUtLS97iGIbRIBTaBPTee+8xbNiwhm78AUSEYcOGlWKkYxhGdhRaAQAN3/h3UZb7NAwjOwqvAAzDMIxkmAJIyU033cSECRO44IILqu5ftmwZZ599dsZSGYZh9E2hncAhsGjRIh555BFzzhqGUThKNwJYumYplz58KUvXLE19rS996Uu88sortLW18a1vfYuTTz6Zk046iZNPPpk1a9bsd/wvf/lLTjzxRE488UROOukktm3bBsB1113H1KlTOf744/n617+eWi7DMIxaKNUIYOmapZz/wPls37mdJR1LuPfce2kb35b4erfeeis/+9nPeOKJJzjwwAP52te+Rv/+/Xnssce48soreeCBB/Y5/vrrr+eWW25hxowZvPvuuwwcOJD29nbWrl3L8uXLUVXa2tp48skn+fjHP572dg3DMHqlVAqgfX0723duB2D7zu20r29PpQAqeeedd5g7dy5r165FRNi5c+d+x8yYMYPLL7+cCy64gE9/+tM0NzfT3t5Oe3s7J510EgDvvvsua9euNQVgGIZ3SmUCmjl2JoOaBgEwqGkQM8fOdHbtq6++mlNOO
YWVK1fy05/+tGrM/vz587ntttv480OFQbwAAAncSURBVJ//zPTp01m9ejWqyoIFC+jo6KCjo4N169Zx4YUXOpPLMAyjJ0o1Amgb38a9595L+/p2Zo6d6az3D9EIYOTIkQDcfvvtVY9Zv349kydPZvLkyTz99NOsXr2a008/nauvvpoLLriAgw8+mDfeeIOmpiYOP/xwZ7IZhmFUI5UCEJETgVuBgcD7wMWqujzetwC4ENgFfEVVH43LpwC3AwcBDwNfVVVNI0c9tI1vc9rwd3HFFVcwd+5cbrjhBk499dSqx9x444088cQT9OvXj4kTJ3LGGWcwYMAAVq1axUc/+lEADj74YO666y5TAIZheEfStL0i0g58R1UfEZEzgStU9RMiMhG4F5gGHA08BhyrqrtEZDnwVeAZIgVwk6o+0lddra2tumLFin3KVq1axYQJExLLXzTKdr+GYaRHRJ5X1dZq+9L6ABQYHG8fCrwZb88G7lPVHar6KrAOmCYiRwGDVfXpuNd/J3BOShkMwzCMBKT1AVwGPCoi1xMpk5Pj8pFEPfwuOuOynfF29/KqiMg8YB7A6NGjU4pqGIZhVNKnAhCRx4Ajq+y6Cvgk8A+q+oCI/B3wL8BpQLXMZdpLeVVUdTGwGCITUF+yGoZhGLXTpwJQ1dN62icidxLZ8wF+CNwWb3cCoyoObSYyD3XG293LDcMwjIxJ6wN4E/ibePtUYG28vRSYIyIDRKQFGAcsV9WNwDYRmS5RfuPPAw+mlMEwDMNIQFofwH8Bvisi/YH3iO31qvqSiNwPvEwUHnqJqu6Kz7mIvWGgj8R/hmEYRsakUgCq+n+BKT3sWwgsrFK+ApiUpt5Q2Lp1K/fccw8XX3xx3qIYhmHUTalSQbhm69atLFq0aL/yXbt2VTnaMAwjLMqnAN7aCmtfi/6nZP78+axfv54TTzyRqVOncsopp/DZz36WyZMns2HDBiZN2jvQuf766/nGN74BRCkhZs2axZQpU/jYxz7G6tWrU8tiGIZRL6XKBcRbW2HVK7B7N/x+C0w4BoYPSXy5a6+9lpUrV9LR0cGyZcs466yzWLlyJS0tLWzYsKHH8+bNm8ett97KuHHjePbZZ7n44ov5xS9+kVgOwzCMJJRLAbz9TtT4Q/T/7XdSKYDuTJs2rc+Vwd59912eeuopzjvvvD1lO3bscCaDYRhGrZRLAQw9NOr5794NBxwQfXbIBz7wgT3b/fv3Z3eXsoE96aF3797NkCFD6OjocFq3YRhGvZTLBzB8SGT2OXpEavMPwCGHHLJnWcfuHHHEEWzatIktW7awY8cOHnroIQAGDx5MS0sLP/zhDwFQVV544YVUcgSHQz+LYRj+KNcIAKJG35HZZ9iwYcyYMYNJkyZx0EEHccQRR+zZ19TUxDXXXMNHPvIRWlpa+NCHPrRn3913381FF13EN7/5TXbu3MmcOXM44YQTnMiUO479LIZh+CNVOugssXTQBbnfta/Bm5v3fj56BIz7YH7yGEbJ8ZkO2jD2ZeihkX8FvPhZDMNwR/lMQIZfuvwsb78TNf5m/jGMYCm8AlBVorxyjU1RTHWAUz+LYRj+KLQJaODAgWzZsqVYjWMCVJUtW7YwcODAvEUxDKOBKPQIoLm5mc7OTjZv3tz3wQVn4MCBNDc3932gYRhGjRRaATQ1NfU589YwDMOoTqFNQIZhGEZyTAEYhmGUFFMAhmEYJaUwM4FFZDPwWh+HDQfeykCcpIQsX8iyQdjyhSwbhC1fyLJB2PLVKtsHVXVEtR2FUQC1ICIrepryHAIhyxeybBC2fCHLBmHLF7JsELZ8LmQzE5BhGEZJMQVgGIZRUhpNASzOW4A+CFm+kGWDsOULWTYIW76QZYOw5UstW0P5AAzDMIzaabQRgGEYhlEjpgAMwzBKSuEVgIj8o4ioiAyvKFsgIutEZI2InF5RPkVEXoz33SSe8kiLyP8Ukd+ISIeItIvI0aHIFtd1nYisjmX8sYgMq
diX97M7T0ReEpHdItLabV/uz66KvLNiedaJyPys6q2o//sisklEVlaUHSYiPxeRtfH/oRX7qj5Dj/KNEpEnRGRV/L1+NRQZRWSgiCwXkRdi2f57KLJV1NdPRH4tIg95kU1VC/sHjAIeJZogNjwumwi8AAwAWoD1QL9433Lgo4AAjwBneJJrcMX2V4BbQ5Etrmsm0D/e/hbwrVDkAyYA44FlQGtFee6yVZG1XyzHMcCBsXwTM/4NfBz4MLCyouzbwPx4e34t369H+Y4CPhxvHwL8NpYjdxnj9+XgeLsJeBaYHoJsFTJeDtwDPOTjuy36COA7wBVApSd7NnCfqu5Q1VeBdcA0ETmKqGF+WqMndidwjg+hVPVPFR8/UCFf7rLF8rWr6vvxx2eArjzTucunqqtUdU2VXbnLVoVpwDpVfUVV/wLcF8uZGar6JPDHbsWzgTvi7TvY+zyqPkPP8m1U1V/F29uAVcDIEGTUiHfjj03xn4YgG4CINANnAbdVFDuVrbAKQETagDdU9YVuu0YCr1d87ozLRsbb3ct9ybdQRF4HLgCuCUm2bnyRqNcMYcrXRYiy9SRT3hyhqhshaoCBw+PyXOUVkTHASUQ97SBkjE0sHcAm4OeqGoxswI1EHdzdFWVOZQt6PQAReQw4ssquq4AriUwZ+51WpUx7KXcum6o+qKpXAVeJyALgUuDrWclWi3zxMVcB7wN3d52WhXy1yFbttCxkq5M8605CbvKKyMHAA8BlqvqnXtw0mcqoqruAE2M/2I9FZFIvh2cmm4icDWxS1edF5BO1nFKlrE/ZglYAqnpatXIRmUxk53ohfpGagV+JyDQizTeq4vBm4M24vLlKuVPZqnAP8K9ECiAT2WqRT0TmAmcDn4xNJ2QlXx3PrpLMnp0DmfLmDyJylKpujE1km+LyXOQVkSaixv9uVf1RiDKq6lYRWQbMCkS2GUCbiJwJDAQGi8hdzmXz6cDI6g/YwF4n8HHs6wx5hb3OwueInDxdzsIzPckzrmL7y8D/CUW2uK5ZwMvAiG7lQcgX17eMfZ3AwchWIVP/WI4W9jqBj8ui7m5yjGFfJ/B17Oso/HZfz9CjbELkl7mxW3nuMgIjgCHx9kHAvxF1inKXrZucn2CvE9ipbJm+qB4f0AZiBRB/vorIC76GiogQoBVYGe+7mXgmtAd5Hojr+Q3wU2BkKLLFda0jshd2xH+3hiIf8B+JejM7gD8Aj4YiWw/ynkkU2bKeyISV9bt/L7AR2Bk/twuBYcDjwNr4/2F9PUOP8v01kSniNxXv25khyAgcD/w6lm0lcE1cnrts3eT8BHsVgFPZLBWEYRhGSSlsFJBhGIaRDlMAhmEYJcUUgGEYRkkxBWAYhlFSTAEYhmGUFFMAhmEYJcUUgGEYRkn5/yG7q4BsDpvIAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "colors = ['pink', 'green']\n", "\n", "# Find points belonging to each category and plot them\n", "for category, category_id in sorted(category_to_id.items()):\n", " points = projected_features[(labels[indices] == category_id).values]\n", " plt.scatter(points[:, 0], points[:, 1], s=10, c=colors[category_id], label=category)\n", "plt.title(\"tf-idf feature vector for each article, projected on 2 dimensions.\",\n", " fontdict=dict(fontsize=5))\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "from sklearn.model_selection import cross_val_score\n", "\n", "\n", "models = [\n", " \n", " RandomForestClassifier(n_estimators=500, max_depth=4, random_state=0),\n", " MultinomialNB(),\n", " LogisticRegression(random_state=0),\n", " KNeighborsClassifier(n_neighbors=3)\n", "]" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "CV = 5 # Cross Validate with 5 different folds of 20% data ( 80-20 split with 5 folds )\n", "\n", "# Create a data frame that will store the results of all 5 folds for each of the 4 models\n", "cv_df = pd.DataFrame(index=range(CV * len(models)))\n", "entries = [] # Initially all entries are empty\n", "\n", "# For each algorithm\n", "for model in models:\n", " model_name = model.__class__.__name__\n", " # create 5 models with different 20% test sets, and store their accuracies\n", " accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)\n", " # Append all 5 accuracies to the entries list (after all 4 models are run, there will be 4 x 5 = 20 entries)\n", " for 
fold_idx, accuracy in enumerate(accuracies):\n", " entries.append((model_name, fold_idx, accuracy))" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEHCAYAAACEKcAKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU9b3/8ddnspNEdtnCHlyoC1epW+0VrtViWy22P+tSfyJqubQq7b1at/a63NqfWreKWrxei6ht1bbuigpal4qAgKIgskRAiGxBRJYkZJnP749zAmOcJJNMhknC+/l45JGZM+d7zme+c+Z8zvd7znyPuTsiIiItFUl3ACIi0r4pkYiISFKUSEREJClKJCIikhQlEhERSUpmugNoTT169PBBgwalOwwRkXZjwYIFm929ZzLL6FCJZNCgQcyfPz/dYYiItBtm9kmyy1DXloiIJEWJREREkqJEIiIiSVEiERGRpCiRiOxDNm/ezLp169AYe9KaOtRVWyIS38aNG7n55ptZsGABAAMGDODyyy9nxIgRaY5MOgK1SEQ6uGg0ypVXXsmCBQvwDCeaE2XNmjX88opfsmHDhnSHJx2AWiQiHcDkyZMpKSmJ+9r27dtZuXIl0fwo28Zuw7Od/FfyYS1ceuml9OnTB4DS0lIAioqKkoqluLiYSZMmJbUMaV/UIhHp4KqrqwGo6VGD5zpEoKZvzZdeA6ioqKCioiItMUr7phaJSAfQWAtgzZo1nHvuuWSVZpG9Iptopyg5S3IAuOiii/je9773pWVMnjw59QFLh6IWiUgHN2DAAL73ve9htUb+m/kUvlRIxvYMiouLOemkk9IdnnQAapGI7AMuu+wyDjjgAGbOnEllZSXHHHMMZ511Fjk5OekOTToAJRKRfUBGRgZjx45l7Nix6Q5FOiB1bYmISFKUSEREJClKJCIikhQlEhERSYoSiYiIJEWJREREkpLSRGJmY8xsmZmVmNlVcV7vamZPmdkHZvaOmR2SaFkREWkbUpZIzCwDuBc4BRgOnG1mw+vNdg2w0N0PA84D7mpGWRERaQNS2SI5Cihx95XuXgU8Bny/3jzDgVcB3H0pMMjMeiVYVkRE2oBUJpJ+wNqY56XhtFjvAz8AMLOjgIFAUYJlZR+wdetWPv74YyorK5tdtqysjJUrV1JTU5OCyESkTiqHSLE40+rf3/Nm4C4zWwgsAt4DahIsG6zEbAIwAYLB6aRj2LVrF3feeScvv/QStdEo+fn5nHfeeZx11lmYxds89ti6dSu//e1vmTt3LgDdunXj0ksv5cQTT9wboYvsc1LZIikF+sc8LwLWxc7g7tvcfby7jyA4R9ITWJVI2Zhl3O/uI919ZM+ePVszfkmje++9l+nTp4PX0i9vFzt37mTKlCm88sorTZa97rrrmDt3LjmRKPvnVLFlyxZ+85vfsGTJkr0Quci+J5UtknnAMDMbDHwKnAWcEzuDmXUBysPzIBcBb7r7NjNrsqykV2N35EtEaWlpgzdRcnfKy8sBmHLkCg7ar4K/re3B3Sv68dvf/pY77rijweVGo1EqKirIz6zl4aOX0iO7hjuX9+PpT3twySWXxB3tNi8vT3cFFElCyhKJu9eY2SXAy0AGMNXdPzSzieHr9wEHAw+bWS2wBLiwsbKpilW
ar6SkhOWL32VAQW2LyteWR4jWxu+icgeIUJBZywGFQbI5ouuO4LVoLdHK7Q0vNyzbP28XPXOCcyMjuuzg6U97EK2pJupVXy1TvY3K1etb9D4A1uzIaHFZkY4gpcPIu/t0YHq9affFPJ4NDEu0rLQtAwpq+fXIHa2+3KjDZbP2o6wyg3tW9OVfe37Bw6t7AXBs7yp+dkh5g2V3VBuX/rMzS7d34rE1PRlaULG77A+GVDJ2SPNP2jflxvkFrb5MkfZE9yORNidicNawCu5ZlM/fS3vy99Lg3FenzChjBzeeCAqynLGDK/nbx3n8oaTv7un759Xyrf67Uhq3yL5KiUTapKN7VdM5ewcvr81hc2WEwYW1fHdgJb06RZss+/3BlfTNr+W1T3PYWW0c3LWG7wyspCAr7oV/IpIkJRJpsw7qWsNBXVv2G5Cv71/N1/evbuWIRCQeDdooIiJJUSIREZGkKJGIiEhSlEhERCQpSiQiIpIUJRIREUmKEomIiCRFiURERJKiRCIiIknRL9ulTXOH9eURqmqN/gW1ZOjQp1WVlZWxadOm3cPv5+bmpjskaYeUSKTN2lAe4d7F+azaFmymXXOiXHjwTkb00K1zk1VeXs4tt9zC66+/jrvjOIaRm5tLSUkJxcXF6Q5R2hEd30mbVBuFW98rYNW2TPbLrKFXThWf74rw+/cL2FiuzTZZd9xxB6+99hpRi1LbpRYL725dWVnJ5ZdfTmVl6w+3Lx2XWiTSIqWlpezcnpGye3HsrDY2VmTQN3cXU49aTm5GlOsXD+T1si78vwUF9MxrehTgveWT7Rnkl5a2uPwFF1zA+vUtv7FWc9XdgdLN2Xb6NqJdouQszqHT3E5EM6Ns2bKFU089lczM9Owe+vTpw9SpU9OybmkZJRJpk2rCEd+HFFTSKTNIGsM7l/N6WRdqPP6dFdurrVu3sqN8x977NjoYhuc40c5B3dbsH3YXho29yupKSEeurgnqQ9oXJRJpkaKiIipr1qfkDokAG8sjXPZ2Z2Z/th9PlHane3YNf1vbA4Cziiv4Zt+v3jI3XW6cX0BuEvd8LyoqoszKiI7aS3tuh8gLESIVEfLm5FHdv5q8hXkAWHWQpKP/FoWueyecWJHXIxT1a3ldSnookUib1KtTlDH9K3lpbS53Ld+zYxnWuYZje7edJNIuGfjhDnMgd0kuuUuCK7Ucx9yIDkxPEpH2S4lE2qwfH1DBkM41zFqfQ1UUDu9ezbf67yJT59qT5v0dz3NshWGfW9CNlQ/RQVF8kO4kKc2jRCJtlhkc17ua43rrTocp0QO8R3Dpr0gydGwnIiJJUSIREZGkKJGIiEhSlEhERCQpSiQiIpKUlCYSMxtjZsvMrMTMrorzemcze87M3jezD81sfMxrq81skZktNLP5qYxTRERaLmWX/5pZBnAvcBJQCswzs2fdfUnMbBcDS9z9VDPrCSwzsz+7e90vzka7++ZUxSgiIslLZYvkKKDE3VeGieEx4Pv15nGg0MwMKAC2ABojXESkHUllIukHrI15XhpOi3UPcDCwDlgE/Nzd6wYccmCGmS0wswkNrcTMJpjZfDObX1ZW1nrRi4hIQlKZSOIN0Vr/J7TfBhYCfYERwD1mtl/42jfc/QjgFOBiM/vXeCtx9/vdfaS7j+zZs2crhS4iIolKZSIpBfrHPC8iaHnEGg886YESYBVwEIC7rwv/bwKeIugqExGRNiaViWQeMMzMBptZNnAW8Gy9edYAJwKYWS/gQGClmeWbWWE4PR84GVicwlhFRKSFUnbVlrvXmNklwMtABjDV3T80s4nh6/cBvwGmmdkigq6wK919s5kNAZ4KzsGTCfzF3V9KVawiItJyKR39192nA9PrTbsv5vE6gtZG/XIrgcNTGZuIiLQO/bJdRESSokQiIiJJUSIREZGkKJGIiEhSlEhERCQpSiQiIpIUJRIREUmKEomIiCRFiURERJKiRCIiIklRIhERkaQokYiISFKUSER
EJClKJK1oy5YtzJs3j7Vr1zY9szRL1KHkiwyWbMmkqjbd0XRAXwAbgV3pDkTao5QOI7+viEajTJkyhb///e/U1gZ7uWOOOYZrr72WgoKCNEfX/n2yPYO7F+WzoTwDgPzMKBccXM7RvarTHFkHUAmR2RFsc3BnbI84fnDwF/dm2SJxqEXSCp577jkef/xxamqj7CrsSzQjmzlz5vD73/8+3aG1ezVRuH1hARvKM+iZU8Xg/Ap21kT4w+J81u3U5pusyLwgiUSzo9T0rMGiRuTDSHCjbJEEqUXSDJMnT6akpOQr05ctWwbAluJT2Ln/oWRWbKHPew8wY8YM1q9fT0ZGcCRdWhp8O4uKipKKo7i4mEmTJiW1jNawZkcGN85PbYtrR7WxZVeEAZ0qmXrUcrLMuXHJAGZu7Mq1cwvJy3R6dYqmNIamrNmRwQHJLmQrRF5PIjHuAGqaWcbBagzPcLb9YBue7+QszqHT3E5E5kZgQQtjyQRaullsBfq1sKykjRJJK6jrzqrq1BOAmpzOeEYWVltFNBrdnUgqKirSFmNrKy4u3ivrKd+yBXauoX+nXWRHHIDB+ZUA1ESyqcrKIXfQsL0SS0MOILn6aI26LC0tbfb2FY1GqaipwHMd7xTUbW3XYFvOsAzysvNaFEteXh5F/Vp4sNRv721b0nrM3dMdQ6sZOXKkz58/f6+v93e/+x3PP/88uwr6sL3v18nbUkL+5iUMGDCARx55hPDe87tbEZMnT97rMbZXGzZs4Mwzz8Q8yoVDNtA1u4b7P+7D1upMBg4cSNeuXVWfLVRTU8MZZ5zBZ599xq4Dd1Hdp5q89/PI+DyDs88+m5/+9KfpDlH2AjNb4O4jk1mGOplbwbhx4+jRowc5O9bTY/mz5G9eQmZWFpMmTdqdRKRlevfuzbnnnksU439X9uF3S/uztTqTkSNH0qVLl3SH165lZmYyadIkIpEIOctyKHi9gIzPM+jXrx9nn312usOTdkRdW62gV69ePPDAAzz99NMsW7aMXr16MXbsWIYMGZLu0DqEiy66iEMOOYSZM2dSWVnJsccey5gxY7jsssvSHVq7N3r0aPr27cszzzzDli1b+NrXvsbYsWMpLCxMd2jSjiiRtJJu3bpxwQUXpDuMDsnMOPbYYzn22GPTHUqHdOCBB3LFFVekOwxpx9S1JSIiSVEiERGRpCSUSMzsCTP7rpkp8YiIyJckmhimAOcAK8zsZjM7KJFCZjbGzJaZWYmZXRXn9c5m9pyZvW9mH5rZ+ETLiohI25BQInH3V9z9x8ARwGpgppm9bWbjzSwrXhkzywDuBU4BhgNnm9nwerNdDCxx98OBUcDtZpadYFkREWkDEu6qMrPuwPnARcB7wF0EiWVmA0WOAkrcfaW7VwGPAd+vN48DhRb82KIA2EIw0EMiZUVEpA1I6PJfM3sSOAh4BDjV3deHLz1uZg39lLwfEDueeilwdL157gGeBdYBhcCZ7h41s0TK1sU2AZgAMGDAgETejoiItKJEf0dyj7v/I94Ljfy0Pt5PuuuPx/JtYCHwb8BQgi6zfyZYtm799wP3QzBESgOxiIhIiiTatXWwme0ej8LMuprZz5ooUwr0j3leRNDyiDUeeNIDJcAqgpZPImVFRKQNSDSR/MTdt9Y9cffPgZ80UWYeMMzMBptZNnAWQTdWrDXAiQBm1gs4EFiZYNl2YfXq1cyaNYv169fj7pSXlzN79my2bNmS7tBERFpFol1bETMzD4cKDq+qym6sgLvXmNklwMtABjDV3T80s4nh6/cBvwGmmdkigu6sK919c7iOr5Rt/ttLn507d3LDDTcwZ84cIBjmIxKJUFtby5VXXklmZibnnnsu48eP18COItKuJZpIXgb+amb3EZyrmAi81FQhd58OTK837b6Yx+uAkxMt257cc889zJkzh2hGNlUFfcjZthavrSUayaKqoDc529Yybdo0hgwZwqhRo9IdrohIiyWaSK4E/h34KUHLYQb
wQKqCSoULLriA9evXNz1jK6jrwgLYeOi5VHfqSd6WFfRc+iS12flsOuQcCte9Q9fVr3HDDTdwyy237JW4YvXp04epU6fu9fWKSMeTUCJx9yjBr9unpDac1Nm6dSs7dpZDxl4Y8NgdA9wi1OQE1yhU53YDIFJbHT7vDkBNbZQdlVWpjylWbQ1bt25tej4RkQQk+juSYcBNBL8yz62b7u7t5oYbRUVFbNyVSeXw7+2V9eUufoaMnWX0WP4MO7sfROGG9wCIRrLI3/gB+306F4DqfiOoLjpyr8S0O7Ylz1NU1HuvrlNEOq5Er9p6kKA1UgOMBh4m+HGiNKBq4DF4JJO8zz+mR8kL5OxYhwNZu7bS/eMXyarcQjSvK9W9D0l3qCIiSUm0nyfP3V8Nr9z6BLg+/OHgdSmMrV2LFvai4tAfkLnpIyKV24l26kZtYW8yP1+NVZVTW9iLmv0PhIxGL34TEWnzEk0kleEQ8ivCy3I/BfZPXVgdg+fuR/WAL4/sUtW5b5qiERFJjUS7tn4BdAImAUcC5wLjUhWUiIi0H022SMIfH/7I3X8J7CAY1kRERARIoEXi7rXAkaafX4uISByJniN5D3jGzP4G7Kyb6O5PpiQqERFpNxJNJN2AzwiGe6/jgBKJiMg+LtFftuu8iIiIxJXoL9sfJM6Npdz9glaPSERE2pVEu7aej3mcC5yObjQlIiIk3rX1ROxzM3sUeCUlEYmISLuS6A8S6xsGDGjNQEREpH1K9BzJdr58jmQDwT1KRERkH5do11ZhqgMREZH2KaGuLTM73cw6xzzvYmZjUxeWiIi0F4meI7nO3b+oe+LuW9EQ8iIiQuKJJN58e+GetSIi0tYlmkjmm9kdZjbUzIaY2Z3AglQGJiIi7UOiieRSoAp4HPgrUAFcnKqgRESk/Uj0qq2dwFUpjkVERNqhRK/ammlmXWKedzWzl1MXloiItBeJdm31CK/UAsDdPyeBe7ab2RgzW2ZmJWb2lRaNmf3SzBaGf4vNrNbMuoWvrTazReFr8xN9QyIisnclmkiiZrZ7SBQzG0Sc0YBjhbfovRc4BRgOnG1mw2Pncfdb3X2Eu48ArgbecPctMbOMDl8fmWCcIiKylyV6Ce+vgLfM7I3w+b8CE5oocxRQ4u4rAczsMeD7wJIG5j8beDTBeEREpI1IqEXi7i8BI4FlBFduXUZw5VZj+gFrY56XhtO+wsw6AWOA2FGGHZhhZgvMrMGkZWYTzGy+mc0vKytr8r2IiEjrSnTQxouAnwNFwELgGGA2X7717leKxZnWUHfYqcCset1a33D3dWa2PzDTzJa6+5tfWaD7/cD9ACNHjmy0u03apxkzZvDnP/+ZTz/9lMGDBzNu3DiOP/74dIclIqFEz5H8HPg68Im7jwb+BWjq8L8U6B/zvIiGb4Z1FvW6tdx9Xfh/E/AUQVeZ7GNefPFFbrzxRlatWkVVVRXLli3jmmuuYdasWekOTURCiSaSSnevBDCzHHdfChzYRJl5wDAzG2xm2QTJ4tn6M4WDQZ4APBMzLd/MCuseAycDixOMVToId+ehhx4C4N+HruOFby7m7AGbAHjkkUfSGZqIxEj0ZHtp+DuSpwm6mT6niVvtunuNmV0CvAxkAFPd/UMzmxi+fl846+nAjPBHj3V6AU+ZWV2MfwnP00gHMXnyZEpKShqdJxqNsm7dOgznzP5lZEbgzAFlPLpmf5YuXUpeXh4AkyZNSiqW4uLipJchsi9L9Jftp4cPrzez14DOQJM7dnefDkyvN+2+es+nAdPqTVsJHJ5IbB2GR4lsW49VVxAt6IXn6hYwZkZWVhbV1dU882kPTu33GU+XdgcgJydndyIR2ds2bdrEBx98QGFhIUceeSSZmfv2GLbNfvfu/kbTc0lzWOUX5C6bQaQyGKnfgZreh1I14CiweNcstH+JtgCeeeYZbr/9du5a0Y+7Vuy56O/qq69m1KhRKYpOpGEPPvggDz30ENFoFIDevXtz000
3MXTo0DRHlj4tvWe7tBZ3ckpeI1L5BTU5nanoOhQsQtaGRWRsWZnu6NLutNNO4/LLL6dv374ADBw4kOuuu05JRNJizpw5PPjgg3htLYfs2sX+NTVs2LCB6667bndi2RftU+2xSPkWcpc8n7b1W+U2ADx3vz0TozVk7NxMbWYu60eMxzNyKFg/n26rXiVn1dtEN37U6nFEyrcAvVt9ualgZpx22mmcdtpp1NbWkpGRke6QZB/Q0Dm8Tz75BIBTd5bz3fJyqoFfd+/GmjVr+MlPfkJ+fv7ueUtLSwEoKipqcRzt5fzdPpNIiouL0x0CK1ZsB2DY0D078fLycpYv34xn5OCRbABqs4PzI4V5WRQPTcUOv3ebqI/mUhKRdKutrQWgS9j6yAIKos7WDL7SIqmoaOo32x2HuXec3/CNHDnS589vu+M71h1ZTJ48efe0mpoafvSjH7F582Z2dj+IqsK+FK6bR2bVdiZOnMg555yTrnBFpJ66c3aF0Sjf3lnOp5mZzM7LpaCggCeffJLc3Nzd88b7vrdFZrYg2fEMdY4kzTIzM7n88svJzMoi/7OldF39DzKrtjN8+HBOP/30phcgInvNKaecwhFHHMH2SIS/FxYwOy+XSCTCZZdd9qUksq/ZZ7q22rLjjjuOh6ZN48UXX+Tzzz/nsMMO48QTTyQ7OzvdoYlIjOzsbG677TZee+013nvvPQoLCxkzZgyDBw9Od2hppUTSRvTv358JE5oaUFlE0i0zM5OTTjqJk046Kd2htBnq2hIRkaQokYiISFKUSEREJClKJCIikhQlEhERSYoSiYiIJEWJREREkqJEIiIiSVEiERGRpCiRiIhIUpRIREQkKUokIiKSFCUSERFJihKJiIgkRYlERESSokQiIiJJUSIREZGkpDSRmNkYM1tmZiVmdlWc139pZgvDv8VmVmtm3RIpKyIibUPKEomZZQD3AqcAw4GzzWx47Dzufqu7j3D3EcDVwBvuviWRsiIi0jakskVyFFDi7ivdvQp4DPh+I/OfDTzawrIiIpImqUwk/YC1Mc9Lw2lfYWadgDHAEy0oO8HM5pvZ/LKysqSDFhGR5kllIrE407yBeU8FZrn7luaWdff73X2ku4/s2bNnC8IUEZFkpDKRlAL9Y54XAesamPcs9nRrNbesiCTp1VdfZdy4cYwaNYoJEyYwd+7cdIck7UgqE8k8YJiZDTazbIJk8Wz9mcysM3AC8Exzy4pI8l588UVuuOEGVq1aRTQaZenSpVxxxRW888476Q5N2omUJRJ3rwEuAV4GPgL+6u4fmtlEM5sYM+vpwAx339lU2VTFKrKvcncefPBBACpGVrD1nK1UDq/E3Xn44YfTHJ20F5mpXLi7Twem15t2X73n04BpiZQVkdZVWVnJhg0bcHMqD62ECFQeVknuklxWrVqV7vCkndAv20X2Ybm5ufTo0QNzI3t5NkQhZ1kOAEVFRWmOTtoLJRKRfZiZcc455wCQPyufLg92Ie+9PIDd06VhZWVlPP/887z44ots3bo13eGkTUq7tkSk7fvhD3/Izp07+eMf/4hhmBm/+MUvOOGEE9IdWpv29NNPc9ddd1FbWwtATk4Ov/rVrxg1alR6A0sDtUhE9nFmxrhx4zjttNMAOO200zj99NPTHFXbtnr1au68805qa2s5bNcuDqqqYteuXdx44437ZMtELRIRAeD888/nk08+4fzzz093KEmZPHkyJSUlKV3Hhg0bcHeOrajk/O3bceCuLp35CLj44ovp3r07K1asAGDSpEkpjaUpxcXFKY9BiUREAOjRowd33313usNIWklJCUsXLqR3CtdREf7P8ygQDMWRFw0G39i5di0Za9fu7u7ZunBhCiNp3Ia9tB4lEhHpcHoDF8Ydaal1rMW5H3gjLw8Dqsx4NzcHc2e8GV1SuO7m+GODo1K1LiUSEZFm6o9xPM5bZrzaqdPu6WPaUBLZm5RIRERa4NsYB+MsJbhq6RCg9z6YREC
JRESkxQZgDEh3EG2ALv8VEZGkKJGIiEhSlEhERCQpOkeSYps2bWL27NmYGdXV1WRlZaU7JBGRVqVEkkLPPvvs7mEUIBiKon///k2UEhFpX9S1lSKlpaXccccd1NbWUt5tGBVdh+LurF27lk2bNqU7PBGRVqMWSTM0ZwyfjRs3Eo1G2dljOJ8dcCoAPZY+SactK/j5z39Oz549WxzH3hg7R0QkUWqRpIh7MDSBR/bkao9kfek1EZGOQC2SZmhOK+Djjz9m/Pjx5Jctwi2C4XTavAQz49Zbb9W5EhHpMNQiSZGhQ4dy4YUXYu4UblxIwcb3MWDixIlKIiL7qCqc93HexlmL43tpUMVUU4skhcaNG8dxxx3Hm2++iZlxwgknMHTo0HSHJSJpsAHnYYftMcNxHQL8H5yMdj5GlxJJig0bNoxhw4alOwwRSSPHeYIgiRRV1zCoppp5OTksjkQYDByV7gCTpEQiIh1KaWkp29l79+JIRDXBTabyo1Gu/PxzsoFh1dU8uN9+zAQWpSjW9cCO0tKULDuWzpGIiKRYXZqIsGenm1V3ZWc6AmplapGISIdSVFTE1s2bU3qHxOaK4kwGPotEuKtLZ4ZU1/BWXi4Ao4FvpCjWP+J0KSpKybJjpbRFYmZjzGyZmZWY2VUNzDPKzBaa2Ydm9kbM9NVmtih8bX4q4xQRSaUIxg+AXIfl2dm8lN+JHZEIQ2n/50cghS0SM8sA7gVOAkqBeWb2rLsviZmnC/AHYIy7rzGz/estZrS7b05VjCIie8sAjF+Y8wGwAxgADCNIMu1dKru2jgJK3H0lgJk9BnwfWBIzzznAk+6+BsDdNQiViHRY+RjHpjuIFEhl11Y/YG3M89JwWqwDgK5m9rqZLTCz82Jec2BGOH1CQysxswlmNt/M5peVlbVa8CIikphUtkjitdfqX6CQCRwJnAjkAbPNbI67Lwe+4e7rwu6umWa21N3f/MoC3e8H7gcYOXJkR7gAQkSkXUlli6QUiB0LpAhYF2eel9x9Z3gu5E3gcAB3Xxf+3wQ8Rcc4JyUi0uGkMpHMA4aZ2WAzywbOAp6tN88zwDfNLNPMOgFHAx+ZWb6ZFQKYWT5wMrA4hbGKiKRMFGcWzu9xbsJ5FKesQ/yCJJCyri13rzGzS4CXgQxgqrt/aGYTw9fvc/ePzOwl4AMgCjzg7ovNbAjwlJnVxfgXd38pVbGKiKTSi8CcmOdLgFUOPzOni67aapy7Twem15t2X73ntwK31pu2krCLS0SkPduO8447EeAn27bRv7qGR/YrZFl2NnOAMekOsBXol+0i0uFsIP1jbX0W/u8ERM0YVF3NEbuqABhdXsGy7GwWAJ+mMM4NQJeULX0PJRIR6VCKi4vTHQIAZStWANB94EA2LllCaWYmqzMz6VtTwzu5wfAoBT16pHQIky7snfqwjvk4PdYAABDXSURBVHTb15EjR/r8+RpNRUTSr+6OqpMnT+aGG27g1VdfBSDiTtSM7OxsHnjgAQYNGpTGKMHMFrj7yGSWoRaJiEgrcnfeffddNm7cSFZWFuXl5Vx11VV06dKF6S+8QEVlJQcddBAXX3xx2pNIa1GLRESklezatYtf//rXzJ07d/e07t27c/vttzNkyBBqamqoqakhN+zaagtao0Wi+5GIiLSSJ554grlz55IfjTK6vJz+1dV89tln3HzzzQBkZma2qSTSWtS1JSJSz+TJkykpKWl2ueXLlwNw3rbtjKiqYhdwRY/uLF26lIkTJ5Kdnd2s5RUXF+8+19KWqUUiItLKrN7/jk4tEhGRelraCvjTn/7E/fffz8P7FXJMZSUlWVlURiIccMABTJkyhXC0jg5HLRIRkVZyxhlnMHLkSHZEIrzSqROrs7Lo2rUrV111VYdNIqAWiYhIq8nJyeG2225j3rx5LFmyhJ49ezJ69Gjy8/PTHVpKKZGIiLSiSCTC0UcfzdFHH53uUPYadW2JiEhSlEhERCQpSiQiIpIUJRIREUm
KEomIiCSlQw3aaGZlwCfpjqMJPYDN6Q6iA1F9ti7VZ+tqD/U50N17JrOADpVI2gMzm5/sSJuyh+qzdak+W9e+Up/q2hIRkaQokYiISFKUSPa++9MdQAej+mxdqs/WtU/Up86RiIhIUtQiERGRpCiRiIhIUtpUIjGzWjNbaGaLzew5M+vSSss938zuaaVlrTazRWGcC83suNZYbpz1jDCz79SbdoqZzTezj8xsqZndFk6/3swub8V1vx3z+FYz+zD8P9HMzjMzN7NHYubJNLMyM3s+gWXvCP8PMrNzYqaPNLPJrfUeGlj3aWZ2VRPz7N5WwnotN7P9Y17fEfO4bnt938zejd0WYudLIt5G66QldRiz/X5gZm+Y2cBk42wtMdtXbB1/x8xWmNmApj6PRpY7val9iZm9bmZfuUy3NfcdcZZ9efg9XhxuQ+c1FksL17F7mzCzHDN7JdxmzzSzB8xseGusp60NI1/h7iMAzOwh4GLgt+kNKa7R7t6sHxmZWaa71zSjyAhgJDA9LH8IcA/wXXdfamaZwITmxJAod49Njv8O9HT3XXUTzOwPwCFmlufuFcBJwKf1l9PEex4EnAP8JVznfGB+67yD+Nz9WeDZZhbbDFwGXBnntdjt9dvATcAJSQUZI4E6GUTL6nC0u282sxuAXwM/SSZOC+7YZO4eTWY57n5fuLw/hP9PBO4GTnb3NeGNoRr7PBpa7neanqv1NVYvZjaR4HtzlLtvM7POwNjWjqHeNvEvQFbdNgs83pxlmVmGu9fGe61NtUjqmQ30AzCzo8zsbTN7L/x/YDj9fDN70sxeCo9afldX2MzGm9lyM3sD+EbM9IFm9mp4RPaqmQ0Ip08zsylm9pqZrTSzE8xsanj0P62xQJtY5h1m9hpwi5kNDWNdYGb/NLODwvnOiDkqedPMsoH/Bs6sO3oArgB+6+5LAdy9xt3/ECeWn5jZvHBZT5hZp3jrCKd9zczeCdfxgZkNC6fXtRqeBfKBueERTGzLZzbwtpktAKYBr8S859lmVgLcEk5bbGaD6oV6M/DNcN3/YWajLGzRhOuZGh6ZrTSz3fc9NbP/DJe32Mx+EU4bFB7ZPRBO/7OZfcvMZoXbxVEx20tda+NUM5sbblOvmFmvBj7eqeHn0K2xbQDYD/i8sRksaGXOCev6KTPrGk7/ejhttgUtv8Xh9Ng6OcH2tILfM7PCJuqwwMwetD2tjx/GCSn2O9Yz3F7mhX/fiJk+04IW1/+Y2Sdm1iOs848s2Om/C/Q3s1+GZT+wIElhZvlm9kK43S0Ot2XM7GYzWxLO+5WWtZmNB14AaoDb6uoKiAITw+/QcmL2YWZ2bsz2/D9mlhFOX21mPcLH/xVuKzPN7FH7ckv+jLD8cjP7Zsz0/hZ8b5eZ2XUx62toW6xfL9PCeRaZ2X+Exa8Bfubu2wDc/Qt3fyjONjPFgl6ID+vqtJH6i/cdH2Vmz1vQivsTMCKsn6EW0/Ixs5PD7e9dM/ubmRXE1N21ZvYWcEacbSjg7m3mD9gR/s8A/gaMCZ/vB2SGj78FPBE+Ph9YCXQGcgmGR+kP9AHWAD2BbGAWcE9Y5jlgXPj4AuDp8PE04DHAgO8D24BDCTbUBcCIcL7VwCJgITA3gWU+D2SEz18FhoWPjwb+ET5eBPQLH3eJeW/3xNTNu8DhDdTb9cDl4ePuMdNvBC5tZB13Az8OH2cDebGfQ5zH1wOXAzuAucBLYb0vD+N7PnzPy4FfxpRbDAyq9xmPAp6PmWf383A9bwM5BENMfAZkAUeG7yMfKAA+JDjKGkSww4n9vKbGfJZP169ToCt7rlq8CLg9zjx17/da4IY49VFLsB0sBb4AjoxXbzHTPgBOCB//N/D7mPo5Lnx8M7A4Tp08B3wjfFxA0JvQWB3eUrf8uvcbs/32CB//HpgQPv4LcHz4eADwUfj4HuDq8PEYwMPPZBDBTv2Y8LWTCS51tfAzeB74V+CHwP/GxNEZ6AYsi6n/LvXquzr
8PMfH1lX4+scE3+cbgO8ANeE8B4d1lBU+/wNwXux7JmjhLwTygEJgBXu+N6+zZxv4DvBKzPawHugellscLqexbTG2Xo4EZsa8/y7huj9vZD/4OjAyfNwtZp/4OnBYI/UX7zs+ij3bxO7HsesJ6+ZNID+cfiVwbUzdXdHUvrutdW3lmdlCgg9jATAznN4ZeMiCI2Yn2KnUedXdvwAwsyXAQIKKed3dy8LpjwMHhPMfC/wgfPwI8LuYZT3n7m5mi4CN7r4oLP9hGNPCcL76XVuNLfNv7l4bZvjjgL/Znns354T/ZwHTzOyvwJON1lDTDjGzGwk22ALg5UbWMRv4lZkVAU+6+4pmrOcwgs9iOcH7yAHWha99GL6WjBc86E7bZWabgF7A8cBT7r4TwMyeBL5J0F21qt7n9WrMZzkozvKLgMfNrA9BEl3VSCyTgYVmdnu96bFdW8cCD5vZIR5+A2NZ0HXRxd3fCCc9RLAtdAEK3b3uvNRfgO/FiWEWcIeZ/Zngsyq1xu8B/i3grLon7h7bWnrNghbYJoKurbr5h8csc7+w1XM8cHq4jJfMLHY5n7j7nPDxyeHfe+HzAmAY8E+CFsUtBDuxf1rQLVsJPGBmLxAknVjVBAmprgvmIYIDy7puyf8G/gd4mD0tkhMJdtrzwveQF76/WMcDz3jQHYuZPVfv9brvxQK+vM3MdPfPwjJPhstxGt4WY+tlJTDEzO4maGHNCOsm0e/Hj8xsAsGBQx9gOLCE+PXX0v3IMeFyZ4V1l02wb6jTZBdYW+vaqvtiDiR4MxeH038DvObuhwCnEhwF19kV87iWPed9Ev2gYuerW1a03nKjNO98Uuwyd4b/I8BWdx8R83cwgLtPJPhC9yfYYXWPs8wPCb4oTZkGXOLuhxIcteU2tA53/wtwGlABvGxm/9aM97iV4Og5n2AnNC7mtUq+vG3Ffl6Jive5NrbnrP95xX6W8T67uwlaHocSnAdqMEZ330qwg/9ZI/PMJjiAae7gd41mg5jl30zQcsoD5ljYLdrEchv6Dowm+I59SLBThuDzOjZm2+zn7tubiG9nzGMDboopX+zuf3T35ew5er/JzK714LzZUcATBOcFXqq33CjBuZCvm9k1cdb7GcHnEbvNGfBQzPoPdPfr65Vrqq7rtpnY/Qh8tR69iWXtrpcwgR9OcPR/MfCAB91ZO81sSGPBmNlgghbaie5+GEEiym2o/hLcj8RdFUGyrKu74e5+Ybz305C2lkiAoL8QmARcbmZZBC2SupO55yewiLnAKDPrHpaP7dt7mz1Haj8G3mqFkJtcZrjxrDKzMyA4EWdmh4ePh7r7XHe/luAL1B/YTtAErnMrcI2ZHRCWiZjZf8aJpRBYH77vH9dNjLeOcENe6e6TCY6kDmvGe14FbCTYES0Ghsa8tgk4IlzvEcDgOOXrv79EvAmMNbNOZpZPcKT8z2Yuo07sNjWusRlDdxAknLgHFOGOPYNgJ/cV4Tb9eUzf+/8F3gh3NNvN7Jhw+lnxyoef3yJ3v4Xg5OlBNF6HM4BLYsp3jX0xPCr/BXCeBed/6s9f1xp4C/hROO1kgi7BeF4GLojpW+9nZvubWV+g3N3/BNwGHBHO09ndp4cxjIizvC0EXbM/Jqj7N+q9fgfB51a3Q38V+D/huQDMrJt99Yq0t4BTzSw3jOG7DbyX+k4Kl5dHsOOeRYLbogXnZiLu/gTwX4TfC4ILM+41s/3C+fYLWx6x9iPYiX8RtiBPCeeNW38N7EcSMQf4hpkVh8vpVLefSVRb69razd3fM7P3Cb5YvyPo2vpP4B8JlF1vZtcTNM/WE/TfZ4QvTwKmmtkvgTJgfCuEm+gyfwxMMbNfE3TPPQa8D9wadtsZwRfifYJzPFeFXX03ufvjFpzQe9SCE+hOcIRS338RJNJPCI4C63Y08dZxFXCumVUDG9hzdJqIHwNTCJrb/06wMdaZC5wfxj6PoPurvg+AmvAznsaeLpEGufu7Flz48E4
46YFwOxnUjLjrXE/QtfRpGHu8ZBe77s1m9hTwHzGT67piIajXcb7nqpZOZlYaM2/dju++8PNbyZ7t5ELgf81sJ8GR6xdxQviFmY0mOFpeArxIcOTeUB3eSLCjWhyWuYF63R3h9+RRgiPlSeH8HxDsF94EJoblHrXgJPkbBN+n7QTdM7HLmmFmBwOzw+6RHcC5QDHBthcl6LL6KcE2+YyZ5Yb1FlundcYB9xEc7H6L4Mqy3Tu38PN4IVwe7r4k/F7NMLNIuK6LibmthLvPs+ACkvfD6fMbqOv63iLosi4G/uLBlVAkuC32Ax4MYwK4Ovw/haAO54Xfv2rgS12n7v6+mb1H0HJcSZDAoOH6i/cdb/IqQncvM7PzCT7nuu72XxP/exuXhkgRSTMzK3D3uivlrgL6uPvP0xwWEPz2AKh19xoLzgNN8T2Xj7Y7dXUdJvM3CS42eDfdcbV3bbZFIrIP+a6ZXU3wffyExLpv95YBwF/DI+oqkvzNSRtwvwU/wsslOKeiJNIK1CIREZGktMmT7SIi0n4okYiISFKUSEREJClKJCIikhQlEpEWsJiBAJOZR6QjUCIREZGkKJHIPsMSGG4+HArjaQuG555jZoeFZbub2QwLhnD/H2LGWrIGhi9PIJaPzOx/LRgifEY4BEdjtwJI6FYH1sCQ4CKpokQi+5pi4C6CccUOIrgx1PEEg+NdQzAkyHvhIHnXEIwwC3Ad8Ja7/wvBuGR195w5GDiTYIj3EQTDkewe46wJw4B73f1rBINg1t0z5El3/7q7Hw58RDCESp2uwL8RDIvxHHAn8DXgUAvud9KDYHiLb7n7EQTDgMQbk02k1eiX7bKvaWq4+YGEO3R3/0fYEulMcG+NH4TTX7A9w6knMnx5Y7HUjdUVO3R5Q7cCgKZvdVBE40OCi7Q6JRLZ1zQ13Hy8WwN7vf+x6oYvvzrOa82JpZYgCUEwAOPYcNC+8wluSFS/TEO3OqglGBL87BbEI9Ii6toS+bI3CbumzGwUsDm8BUDs9FPYM5x6IsOXN1fcWwEkKOkhwUWaSy0SkS+7nmDY7w+Acvbcq6RuOPV3CYZTXwOJDV/eAg3dCqBJrTEkuEhzadBGERFJirq2REQkKeraEkkhC+6b/Wqcl05097i35RVpb9S1JSIiSVHXloiIJEWJREREkqJEIiIiSVEiERGRpPx/rhtERkuVQYcAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Store the entries into the results dataframe and name its columns \n", "cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])\n", "import seaborn as sns\n", "\n", "sns.boxplot(x='model_name', y='accuracy', data=cv_df)\n", "sns.stripplot(x='model_name', y='accuracy', data=cv_df, \n", " size=5, jitter=True, edgecolor=\"gray\", linewidth=2)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "model_name\n", "KNeighborsClassifier 0.670310\n", "LogisticRegression 0.833198\n", "MultinomialNB 0.843455\n", "RandomForestClassifier 0.759649\n", "Name: accuracy, dtype: float64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_df.groupby('model_name').accuracy.mean()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
model_namefold_idxaccuracy
0RandomForestClassifier00.666667
1RandomForestClassifier10.789474
2RandomForestClassifier20.815789
3RandomForestClassifier30.736842
4RandomForestClassifier40.789474
5MultinomialNB00.743590
6MultinomialNB10.868421
7MultinomialNB20.894737
8MultinomialNB30.894737
9MultinomialNB40.815789
10LogisticRegression00.692308
11LogisticRegression10.868421
12LogisticRegression20.921053
13LogisticRegression30.868421
14LogisticRegression40.815789
15KNeighborsClassifier00.641026
16KNeighborsClassifier10.657895
17KNeighborsClassifier20.736842
18KNeighborsClassifier30.631579
19KNeighborsClassifier40.684211
\n", "
" ], "text/plain": [ " model_name fold_idx accuracy\n", "0 RandomForestClassifier 0 0.666667\n", "1 RandomForestClassifier 1 0.789474\n", "2 RandomForestClassifier 2 0.815789\n", "3 RandomForestClassifier 3 0.736842\n", "4 RandomForestClassifier 4 0.789474\n", "5 MultinomialNB 0 0.743590\n", "6 MultinomialNB 1 0.868421\n", "7 MultinomialNB 2 0.894737\n", "8 MultinomialNB 3 0.894737\n", "9 MultinomialNB 4 0.815789\n", "10 LogisticRegression 0 0.692308\n", "11 LogisticRegression 1 0.868421\n", "12 LogisticRegression 2 0.921053\n", "13 LogisticRegression 3 0.868421\n", "14 LogisticRegression 4 0.815789\n", "15 KNeighborsClassifier 0 0.641026\n", "16 KNeighborsClassifier 1 0.657895\n", "17 KNeighborsClassifier 2 0.736842\n", "18 KNeighborsClassifier 3 0.631579\n", "19 KNeighborsClassifier 4 0.684211" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_df" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "model = RandomForestClassifier()\n", "\n", "#Split Data \n", "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, train_data.index, test_size=0.33, random_state=0)\n", "\n", "#Train Algorithm\n", "model.fit(X_train, y_train)\n", "\n", "# Make Predictions\n", "y_pred_proba = model.predict_proba(X_test)\n", "y_pred = model.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "category_id_df = train_data[['Category', 'category_id']].drop_duplicates().sort_values('category_id')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 14.09375, 'Predicted')" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAWkAAAELCAYAAAAbR3cfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deZxcVZn/8c83CSQhIexLWCSIKyCERVkCGhAQ3FEB85NN1KAC6riBjjMs48zgCCojCAmLRMDIGhEEZFGSgYFACIEAyYhiWIMh7AFJ0l3P7497Ciqhu6uqU/fW7e7vm9d9Vd1bdc853V08dXLuuc9RRGBmZuU0qN0NMDOz7jlIm5mVmIO0mVmJOUibmZWYg7SZWYk5SJuZldiQdjegJ8sXP+L5gfYmwzfZs91NsBLqWPakVrWMZmLOauu/dZXra0Spg7SZWaEqne1uwZs4SJuZVUWl3S14EwdpM7OqioO0mVlphXvSZmYl1tnR7ha8iYO0mVmVLxyamZWYhzvMzErMFw7NzMrLFw7NzMrMPWkzsxLrXN7uFryJg7SZWZWHO8zMSqyEwx1OVWpmVhWVxrceSBom6S5J90l6UNLJ6fhJkp6UNCdtH67XJPekzcyqWteTXgrsHRFLJK0G3Cbp+vTaTyPitEYLcpA2M0ui0poLhxERwJK0u1raepUf38MdZmZVlUrjWx2SBkuaAywCboqImemlYyXdL+kCSevUK8dB2sysqokxaUkTJc2q2SauUFREZ0SMBTYD3idpW+BsYCtgLLAQOL1ekzzcYWZW1USCpYiYDExu4H0vSLoV2L92LFrSucC19c53T9rMrKp1szs2kLR2ej4c2AeYL2l0zdsOBB6o1yT3pM3Mqlo3u2M0MEXSYLLO8GURca2kiySNJbuIuAA4ul5BDtJmZlUtSvofEfcDO3Rx/LBmy3KQNjOrKuEdhw7SZmZJhFdmMTMrL/ekzcxKzFnwzMxKzD1pM7MSa9HsjlZykDYzq/Jwh5lZiXm4w8ysxBykzcxKzMMdZmYl5guHZmYl5uEOM7MS83CHmVmJuSdtZlZiDtJmZiUWvVrQO1cO0mZmVR2e3WFmVl6+cGhmVmIekzYzKzGPSZuZlZh70mZmJeYgbWZWXtHZmoVoJQ0DZgBDyeLsFRFxoqR1gUuBMcAC4OCIeL6nsga1pEVmZv1BpdL41rOlwN4RsT0wFthf0q7ACcAtEfF24Ja03yMHaTOzqqg0vvVUTGZJ2l0tbQF8ApiSjk8BPlmvSQ7SZmZVlWh4kzRR0qyabWJtUZIGS5oDLAJuioiZwEYRsRAgPW5Yr0kekzYzq2riwmFETAYm9/B6JzBW0trANEnb9qZJDtIls3TpMo445jssW76czo5O9t1rD4794mHM//NfOeXHP2fpsuUMHjyYf/n2Mbxn63e2u7nWJptttgkXXnAGG228AZVKhfPOu4Sfn3l+u5vV97XowmGtiHhB0q3A/sDfJY2OiIWSRpP1snvkIF0yq6++Ghf896msscZwlnd0cPhXvs2eu+7MmeddxFeO+hx77vZeZvzvXZz+i/O58Mz/andzrU06Ojr4zndP5t45DzBy5AjumnkDN98yg3nzHm530/q2Fk3Bk7QBsDwF6OHAPsCPgN8BRwCnpser65XlIF0yklhjjeFA9j9iR0cHkpDEkldeBWDJK6+y4frrtbOZ1mZPP72Ip5/OOmFLlrzC/PkPs+kmGztIr6pKy+44HA1MkTSY7NrfZRFxraQ7gMskfQF4DDioXkG5BmlJNwEHRcQLaX8d4DcR8aE86+3rOjs7Ofior/HYk08x4VMfZbtt3sXxXz+ao7/5A0476zyiElw86fR2N9NKYostNmPs9tsy8657292Uvq9FCZYi4n5ghy6OPwt8sJmy8p7dsX41QAOkSds9Xs2svWJ63q+m5ty8cho8eDBXTjmLW6ZdxNyH/szDjyzg0mm/5/jjJnLLtIv47tc
m8q//+bN2N9NKYMSINbjs0nP55rdP5OWXl9Q/wXrWxOyOouQdpCuS3lLdkbQF2VzBbkXE5IjYOSJ2/uLhE3JuXrmNWnMk791xO267cxa/u/5m9hk/DoAP7b0ncx/6vza3ztptyJAhXH7puUydOo3f/vb6djenX4hKpeGtKHkH6X8GbpN0kaSLyG6T/F7OdfZpzz3/Ai+lHtFrS5dy5933suUWm7PB+utx971zAZh5zxy22HzTdjbTSuDcyaczb/5f+NkZ3c4Cs2Z1dja+FSTXMemIuEHSjsCugIB/iojFedbZ1z3z7PP88w9Po7NSISrBh/bek/HjdmHUyBGcesYkOjo7Gbr66pz43a+1u6nWRuN2fy+HHfoZ7p/7ELPuvhGAf/mXU7n+hj+2uWV9XIHDGI1S5JA/VdK7ImJ+CtBvEhGzGyln+eJHyvcbs7Ybvsme7W6ClVDHsie1qmW8ctKEhmPOiJOmrnJ9jcirJ/1NYCLQ1RSEAPbOqV4zs94rYU86lyAdERPT4155lG9mlouBtsZhmsj9EbLcqa/XFRE/ybNeM7PeiI7iLgg2Ku87Dq8BXgPmAuX7ijIzqzVQhjtqbBYR2+Vch5lZa5QwSOc9T/p6SfvlXIeZWWu0KOl/K+Xdk76TLI/qIGA52VzpiIhROddrZta8Evak8w7SpwO7AXMjjwnZZmYtFAMwSD8MPOAAbWZ9wgCc3bEQuFXS9WSr5wKegmdmJTUAe9J/S9vqaTMzK6+BFqQj4mQASSMi4pU86zIzW1VlHJnNdQqepN0kPQTMS/vbS/pFnnWamfXaAEz6/zPgQ8CzABFxH/D+nOs0M+udEgbp3BeijYjHpRUy+pXv8qmZGRAd5ctekXeQflzS7kBIWh34Gmnow8ysdMoXo3Mf7vgycAywKfAEMDbtm5mVTlSi4a0nkjaX9CdJ8yQ9KOnr6fhJkp6UNCdtH67XprxndywGPpdnHWZmLdO6seYO4FsRMVvSmsA9km5Kr/00Ik5rtKBcgrSkn9PDquAR4QX6zKx8WjTcERELyW7mIyJeljSPbEShaXkNd8wC7gGGATuS3R7+MNlwhy8cmlkpRUc0vEmaKGlWzTaxqzIljQF2AGamQ8dKul/SBZLWqdemXBairWncn4D9ImJ52l8NuLHRZbW8EK11xQvRWldasRDtcwd+oOGYs+606XXrkzQSmA78e0RcJWkjYDHZSMO/AaMj4qieysj7wuEmwJo1+yPTMTOz8qk0sdWROqVXApdExFUAEfH3iOiMiApwLvC+euXkPQXvVODe1KMG+ABwUs51mpn1Sqty+Su7OeR8YF5tQjlJo9N4NcCBwAP1ysp7dscvUwa8XdKhEyLi6TzrNDPrtdbNkx4HHAbMlTQnHfs+MEHSWLLhjgXA0fUKymt2x7siYr6kHdOhx9PjJpI2iYjZedRrZrYqWtWTjojbyFaiWtl1zZaVV0/6m8BEspVZVhbA3jnVa2bWa9HR7ha8WS5BOiImpseGZnGYmZVBgevLNizXMWlJw4CvAnuQ9aD/BzgnIl7Ls14zs94YcEEa+BXwMvDztD8BuAg4KOd6zcyaF6s81brl8g7S74yI7Wv2/yTpvpzrNDPrlTL2pPO+meVeSbtWdyTtAtyec51mZr0SFTW8FSXvnvQuwOGSHkv7bwHmSZoLRERsl3P9ZmYNq3QOvOGO/XMu38ysZQbccEdEPApsDuydnr8CDIqIR9O+mVlpDLjhDkknAjsD7wR+CawOXEx2y6SZWankmBS01/Ie7jiQLI/qbICIeCqtUmBmVjpF9pAblXeQXhYRISkAJI3IuT4zs17rUxcOW7QE1mWSJgFrS/oScBRZDlUzs9Lpaz3pWataeEScJmlf4CWycel/jYib6pxmZtYW0ZfuOIyIKa2oIAXlLgOzpDsiYrdW1GNmtqrKOAWv7pi0pA2A44GtyRaWBSAiWpFudFj9t5iZFaNSwp50I/OkLwHmAVsCJ5OtJnB3i+ov4YQ
XMxuoItTwVpRGgvR6EXE+sDwipqeVbXetd5KZWV9T6VTDW1EamYK3PD0ulPQR4ClgsxbVX75/W5jZgNXXZndU/VDSWsC3yPJCjwL+qUX1H9aicszMVlkZx6TrBumIuDY9fRFoaDksSS/T9XizsiJjVCq77nLmZmZF6VNT8Kok/ZIuAm4am+5SRPjWbzPrc1qVu0PS5mQrU20MVIDJEXGGpHWBS4ExZJMwDo6I53sqq5Hhjmtrng8jy8fxVJMN3pAVp+891sPbzczaooXDHR3AtyJidspXdI+km4AjgVsi4lRJJwAnkE1x7lYjwx1X1u5Lmgrc3EgrJX0cOB3YBFgEbEE2nW+bRs43MytSpUUXDiNiIbAwPX9Z0jxgU+ATwPj0tinAraxqkO7C28lWWGnEv5FN17s5InaQtBfZYrQN2XfsxF40z/q7SRs2dGnErGl5XDiUNIYsG+hMYKMUwImIhWmUoUeNjEmvfBHwaepE/hrLI+JZSYMkDYqIP0n6UYPnmpkVqpkLh5ImArU9yckRMXml94wErgS+EREvSc1/CTQy3LEqFwFfSI2cAVwiaRHZWI2ZWek005NOAXlyd69LWo0sQF8SEVelw3+XNDr1okeTDQP3qO4dh5JuaeRYNz4BvEo2r/oG4K/Axxo818ysUNHE1hNlXebzgXkR8ZOal34HHJGeHwFcXa9NPeWTHgasAawvaR3euDtwFNmFwHqNHAxcHRH7kE1BaUlWPTOzvHRWWrbs6ziym/XmSpqTjn0fOJUsz/4XgMeAg+oV1NNwx9HAN8gC8j28EaRfAs6qV3BEdEp6VdJaEfFivfebmbVbqzKVRsRtdJ/24oPNlNVTPukzgDMkHRcRP2+m0BqvkX2T3ES2Uni17EZWdTEzK1SUMJ1QI1PwKpLWjogXANLQx4SI+EUD5/4+bWZmpVcpYfLkRoL0lyLi9eGNiHg+rVdYN0i3anUXM7MiVPpoT3qQJEVEdcXvwcDqPZ0g6bKIOFjSXLrO+7Fdr1prZpajvjrc8Qeyq5HnkAXcLwPX1znn6+nxl8BdwOO9bqGZWUE6+2iQPp7srpqvkF2tvBcY3dMJ1dsegTWBScBzwG+AKyLi771urZlZjkq4Dm39m1kiogLcCTwC7Ew2fWReI4VHxMkRsQ1wDNlUvumSGkrOZGZWtEoTW1F6upnlHcBnyRIiPUuWA5WI6E12m0VkOT+eBeomFDEza4e+NiY9H/gf4GMR8RcASU0tmyXpK8AhwAbAFWQzRR7qZVvNzHJVwiUOewzSnybrSf9J0g1kY8rN/ghbkGV/mlP3nWZmbdanpuBFxDRgmqQRwCfJkiRtJOlsYFpE3Fiv8Ig4oWUtNTPLWWe7G9CFRi4cvhIRl0TER4HNgDlkS76YmfUrFanhrShNpXyKiOciYlJE7J1Xg8zM2qVVqUpbqTfLZ5mZ9UtlnCftIG1mlvS12R1mZgNKX70t3MxsQHBP2sysxDwmbWZWYiXM+e8gbWZW5eEOM7MSK+NwR8vWLzcz6+s61fhWj6QLJC2S9EDNsZMkPSlpTto+XK8cB2kzs6TF+aQvBPbv4vhPI2Js2q6rV4iHO8zMklYOd0TEDEljVrUc96TNzJKCcnccK+n+NByyTr03O0ibmSUVNb5JmihpVs02sYEqzga2AsYCC4HT653g4Q4zs6SjifdGxGRgcjPl1y7ELelc4Np657gnbWaW5D3cIWl0ze6BwAPdvbfKPWkzs6SVN7NImgqMB9aX9ARwIjBe0liyOL8AOLpeOQ7SZmZJi2d3TOji8PnNluMgbWaWOHeHmVmJVUoYph2kzcySMq4W7iBtZpaUMcGSg7SZWeJUpWZmJeYxaTOzEitfiHaQNjN7ncekzcxKrLOEfWkHaTOzxD1pM7MS84VDM7MSK1+IdpA2M3udhzvMzErMFw6tVz7zxU/zkQkHQASPzP8bP/rWj1m2dHm7m2UF2/O0L7H5PmN5bfFLXLXP914/vvXn9+XdR+5
HdHTy+B/ncPe//6aNrezbyjgm7ZVZSm79jdfj00d9kqM/8lU+v8+XGDR4MHt/fK92N8va4OHLZ/CHQ3+8wrHRu7+bt+y3E9P2/R5XffAE5p5zXZta1z8UtBBtU9yT7gMGDxnM0GFD6VzewbDhQ1n892fb3SRrg6dn/h8jN1t/hWPvOmwf7j/rGirLstX5Xnv2pXY0rd8oY0869yAtaUtgYUS8lvaHAxtFxIK86+4PFj/9LJdOupzLZv6apa8t5e4Z9zBrxj3tbpaVxFpv3ZiNdnknOx1/EJ1Ll3PXv01l8X2PtLtZfVYZLxwWMdxxOSv+7J3pmDVg5FojGbff7nx2t0P59E6HMHz4MPb91Afb3SwriUGDBzF0rRFc87GTuOuHU9n77GPb3aQ+LZr4ryhFBOkhEbGsupOer97dmyVNlDRL0qynXnmygOaV20577MjCx5/mxedepLOjkxnX38Y2O23T7mZZSbzy9PMsuH4WAIvnPEJUgmHrrtnmVvVdnUTDW1GKCNLPSPp4dUfSJ4DF3b05IiZHxM4RsfMmIzYtoHnltuipRWy9w7sZOmwoADvusQOP/uWxNrfKyuLRG2axybitARi15cYMWn0Irz33cptb1XdVmtiKUsSFwy8Dl0g6ExDwOHB4AfX2C/Punc/062Zw7g1n09nRycMP/oVrL/l9u5tlbTD+zGMYvdu7GbbuSD57938z+/Qr+fOl09nz9Il86ub/pHN5JzO+MandzezTKtG6HrKkC4CPAosiYtt0bF3gUmAMsAA4OCKe77GcaGGjeqxIGpnqa/hrfvxm+5TvUqu13WFs3O4mWAl94YmLV3ldlUO3+FTDMefiR6/qsT5J7weWAL+qCdL/BTwXEadKOgFYJyKO76mc3HrSkg6NiIslfXOl4wBExE/yqtvMrDdaOQUvImZIGrPS4U8A49PzKcCtQHuCNDAiPfoqhpn1CQXM2tgoIhYCRMRCSRvWOyG3IB0Rk9LjyXnVYWbWSh1NBGlJE4GJNYcmR8TkVrepqJtZjiMbKH+9voj4eHfnmJm1QzM96RSQmw3Kf5c0OvWiRwOL6p1QxOyO3wLnA9dQzht6zMyAQgLU74AjgFPT49X1TigiSL8WEf9dQD1mZquklbPdJE0lu0i4vqQngBPJgvNlkr4APAYcVK+cIoL0GZJOBG4EllYPRsTsAuo2M2tYi2d3TOjmpabyOhQRpN8DHAbszRv/moi0b2ZWGgM16f+BwFtr83eYmZXRgExVCtwHrE0DVzHNzNqpqDuwm1FEkN4ImC/pblYck/YUPDMrlTJOPysiSJ9YQB1mZqusyDzRjco9SEfEdElbAG+PiJslrQEMzrteM7NmDcgxaUlfIrt1cl1gK2BT4ByanIZiZpa3zijfgEcRSf+PAcYBLwFExMNA3aQiZmZFK+PyWUWMSS+NiGXVFKWShlDsiuhmZg1pZdL/VimiJz1d0veB4ZL2JVuE9poC6jUza0o0sRWliCB9AvAMMBc4GrgO+EEB9ZqZNaVCNLwVpYjZHRXg3LSZmZXWgJrdIWkuPfyrICK2y6tuM7PeKOPsjjx70h9Nj8ekx4vS4+eAV3Os18ysVwbUzSwR8SiApHERMa7mpRMk3Q6cklfdZma9UcbcHUVcOBwhaY/qjqTdeWORWjOz0hiQFw6BLwAXSFor7b8AHFVAvWZmTSljT7qI2R33ANtLGgUoIl7Mu04zs97oLGEevDxndxwaERdL+uZKxwGIiJ/kVbeZWW+U8Y7DPHvS1XHnNXOsw8ysZQba7I5J6fHkvOowM2ulMvakc5/dIemtkq6R9IykRZKulvTWvOs1M2vWQM2C92vgLLIFaQE+C0wFdimgbjOzhrWyJy1pAfAy0Al0RMTOvSmniCCtiLioZv9iSccWUK+ZWVNyuC18r4hYvCoFFBGk/yTpBOA3ZLk8DgF+L2ldgIh4roA2mJnVNaAuHNY4JD0evdLxo8iCtsenzawUoometKSJZEsDVk2
OiMm1xQE3Sgpg0kqvNayIm1m2zLsOM7NWaOZ27xR0ewq84yLiKUkbAjdJmh8RM5ptUxGzO9aQ9ANJk9P+2yV9tN55ZmZFi4iGtwbKeio9LgKmAe/rTZuKSLD0S2AZsHvafwL4YQH1mpk1pTMqDW89kTRC0prV58B+wAO9aVMRY9JbRcQhkiYARMQ/VL033MysRFo4BW8jYFoKdUOAX0fEDb0pqIggvUzScNIqLZK2ApYWUK+ZWVNaNbsjIh4Btm9FWUUE6ROBG4DNJV0CjAOOLKBeM7OmDNRUpTdJmg3sCgj4eu3kbknbRMSDebfDzKyeAbUQba2IeBb4fTcvXwTsWEQ7zMx6MiB70g3wRUQzK4XOygBK+t+E8n11mdmANGCHO8zM+gIPd3RtWbsbYGYG5Uz6n+cahz1eDIyI2elx17zaYGbWjIGWBe/0Hl4LYO8c6zYza9qA6klHxF55lW1mlodK65P+r7JCxqQlbQtsDQyrHouIXxVRt5lZowbkhUNJJwLjyYL0dcABwG2Ag7SZlUoZg7TybpSkuWSJRu6NiO0lbQScFxEfy7XifkbSxN6u7GD9lz8X/V8R+aT/EdmaNB2SRgGL8JJZvTGx/ltsAPLnop8rYkx6lqS1gXOBe4AlwF0F1Gtm1uflOtyRkvtvFhGPp/0xwKiIuD+3SvspSbMiYud2t8PKxZ+L/i/X4Y7IvgF+W7O/wAG61zzuaF3x56KfK+LC4VnAhRFxd64VmZn1Q0UE6YeAdwCPAq+QpSaNiNgu14rNzPqBIoL0Fl0dj4hHc63YzKwfyDPB0qiIeAl4Oa86zMz6uzwvHP46Pd4DzEqP99Ts9zmSNpF0RS/OW9Lidpwnaesujh8p6cw867BM+l1v0stzr0vTUps551ZJLZvFIenLkg7v4vgYSQ/kWYc1J88ESx9Nj1vmVUfRIuIp4DMrH5c0JCI6CmzHF/tDHX3ckcADwFPNnhgRH175WJquqnTjV+4i4pz+UMdAkPsdh5KuljRB0hp519VKkn4k6as1+ydJ+la1l5F6UpdLuga4UdJISbdImi1prqRPNFHXdyTdLel+SSenY2MkzZc0JR2/ovo7rO1VSfq8pD9Lmg6MqylzA0lXpnLvljSu5ueYIulGSQskfUrSf6U23yBptS7q2D/9XPdJumVVf7dllH7f8ySdK+nB9PsZLmmspDvT32CapHUkfQbYGbhE0hxJw7so7wBJl9Xsj0+fFdLvff2aOn8BzAY2l3S2pFmpDSc30f79JN2R/k6XSxpZU9ePJN2Vtrel4ydJ+nZ6vlP6294BHFNT5mBJP675bB5d87NMl3RZ+uydKulzqfy5krbqoo63Sbo51TO7+h5rQETkugEfAH5BNrvjcrKe6LC8621Bu3cAptfsPwS8H3gg7R8JPAGsm/aHkN2oA7A+8BfeuDC7pId69iOb6yqyL81rUz1jyPJuj0vvuwD4dnp+K1mQGA08BmwArA7cDpyZ3vNrYI/0/C3AvPT8JLIEV6uR5VR5FTggvTYN+ORKdWwAPA5smY6v2+6/TU5/7zFABzA27V8GHArcD3wgHTsF+Fnt76eH8oakv82ItH82cGh6viB9RsYAFWDXmvOqn6fBqY7t6tWXyppRU9fxwL/W1PXP6fnhwLU1n4Pq56n2Z/wxb3zGJwI/SM+Hkg1TbkmWMO2F9PkbCjwJnJze9/Wa31FtHTOBA9PzYcAa7f6b95Ut9550REyPiK+S5euYDBxMlr+j1CLiXmBDZePQ2wPPk/1PV+umiHguPRfwH5LuB24GNgU2aqCq/dJ2L1lv6l3A29Nrj0fE7en5xcAeK527C3BrRDwTEcuAS2te2wc4U9Ic4HfAKElrpteuj4jlwFyyYHBDOj6XLHDU2hWYERF/A6j5efujv0XEnPT8HmArYO2ImJ6OTSH7Aq0rsuGvG4CPSRoCfAS4uou3PhoRd9bsHyxpNtnnYRuy7JH17Jred3v6ex8B1M6qmlrzuFvtiZL
WYsWf8aKal/cDDk9lzgTW443P5t0RsTAilgJ/BW5Mx9/0GUqfu00jYhpARLwWEa828HMZxeWTHg58DDgE2JHsw94XXEHW898Y+E0Xr79S8/xzZL3OnSJiuaQF1OTP7oGA/4yISSsczG6hX3l+ZFfzJbubQzkI2C0i/rFSuQBLASKiIml5pO4NWa9u5c+Eeqijv1la87wTaOriXhcuJRs+eI4sqHU10+n1z5CkLYFvA++NiOclXUjjn6GbImJCN69HN8+r53b39xVwXET8YYWD0nhW/F1Vava7+wxZLxUxJn0pMI9suayzgK0i4ri8622R3wCfJQvU9WZ1rAUsSgF6L1bsyfTkD8BRNWOIm0raML32FknVns8EsmGKWjOB8ZLWS2PJB9W8diNwbHVH0tgG27OyO4APpACCpHV7WU5f9CLwvKQ90/5hQLXH+TKwZpdnveFWsk7Jl1jxXzndGUUWtF9UltL3gAbbeScwrma8eQ1J76h5/ZCaxztqT4yIF1J91X+lfa7m5T8AX6m5TvEOSSMabFNtHS8BT0j6ZCpnqPrYNap2KqIn/Uvg/0VEZ1cvSto3Im4qoB1Ni4gH0z/VnoyIhal3251LgGskzQLmAPMbrONGSe8G7ki93CVkY6GdZF9uR0iaBDxMNq5Ze+5CSSeR/Y+3kGy4ZHB6+WvAWWn4ZQjZmOWXG2nTSnU8I2kicJWkQWRDVfs2W04fdgRwTgoqjwCfT8cvTMf/QRf/YgGIiE5J15JdvziiXkURcZ+ke4EHU1231zmlet4zko4Epkoamg7/APhzej5U0kyyTllXve3PAxdIepUsMFedRzZ0MVvZh/MZ4JONtKkLhwGTJJ0CLCfrUDzSy7IGlNzvOKzbAGl2RPS4svhAlL4Qro2IbdvcFOvD0rDbzhGxuN1tsd4pIul/PR6vMjPrhnvSBZH0Hla8cg6wNCJ2aUd7rHUkTSObmlbr+JUvuLWorplk095qHRYRc1tdl5WDg7SZWYkVMbtj5W/9lY8tyLsNZmZ9VRFj0nf0dCwiPlVAG8zM+qTcgrSkjSXtBAyXtIOkHdM2HvAcyQFAUqey3BYPpHwSvf67S7pQWc6Muhn6Um6J3XtRxwJJ6/6znMEAAAKjSURBVPe2jWZ5yHOe9IfI5oduBvyk5vjLwPdzrNfK4x8RMRZA0iVk87Rf/yxIGtzd/PmeRP0MfePJ5pv/b7Nlm5VNnqlKpwBTJH06Iq7Mqx7rM/4H2C79S+pEsptvxqZZL6eSBdahwFkRMSndPPFzsjtV/0bNVE1Jt5Il7pklaX/gP8hu4lkMfIHsy6BT0qHAcWQ3Fp1DlmgK4BsRcbuk9cjyWWwA3IWng1oJFXHH4S2SfsIbiWmmA6dExIsF1G0lkBIMHcAbiZzeB2wbEX9LdzO+GBHvTReUb5d0I1kWwncC7yFLVPUQWSbA2nI3AM4F3p/KWjcinpN0DlnmwdPS+34N/DQibpP0FrK76t5N9mVxW0ScIukjZFnfzEqliCB9Plly9IPT/mFkt4r7gmH/NzxlUIOsJ30+sDtwVzWrHlmmte2q481kOVDeTvalPjUNhzwl6Y9dlN9ohr59gK3TbffwRkbA95M+hxHxe0nP9/LnNMtNEUF6q4j4dM3+yTX/41r/9vqYdFUKlLXZA7vLtPZh6mffazRDX08ZAQdKhj/ro4qYgvePmgxbKFsh5E3JaGzA6i7T2gzgs8pWBxkN7NXFud1l6Fs5Q113GQFnkLK+SToAWKdlP5VZixTRk/4K2QXEtdL+8zSQEcwGjO4yrU0ju2g4lyyb2/SVT+whQ981wBXKljA7ju4zAp5Mljludip/5UUdzNou99vC08Wgz5BWuSDL0RsRcUquFZuZ9QNF9KSvJlsPbTbZWmhmZtagInrSDzgnsplZ7xRx4fB/0w0LZmbWpCJ60g8BbyO7a2wpadpURGyXa8VmZv1AEUG6ywVZI+LRXCs2M+sH2p7038zMuleGNQ7NzKwbDtJ
mZiXmIG1mVmIO0mZmJeYgbWZWYv8fSR491ViFNI0AAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.metrics import confusion_matrix\n", "import seaborn as sns\n", "\n", "conf_mat = confusion_matrix(y_test, y_pred)\n", "sns.heatmap(conf_mat, annot=True, fmt='d',\n", " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", "plt.ylabel('Actual')\n", "plt.xlabel('Predicted')\n", "#run chi squared" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'fasle' predicted as 'true' : 4 examples.\n" ] }, { "data": { "text/plain": [ "157 this brings the focus back to the complexity o...\n", "8 pandemics created by novel viruses from animal...\n", "123 several factors of emergence have been identif...\n", "173 ironically covid has achieved what was the u...\n", "Name: Text, dtype: object" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "'true' predicted as 'fasle' : 9 examples.\n" ] }, { "data": { "text/plain": [ "71 one key factor that appears to be crucial in l...\n", "169 some brief explanations of the basic concepts ...\n", "153 the control of influenza a viruses in swine i...\n", "33 siamon gordon frs is an emeritus glaxo wellc...\n", "26 to promote awareness of the existing or emergi...\n", "139 equine influenza is a common highly contagiou...\n", "24 viral infections remain a global threat to wor...\n", "176 beginning in late 2014 and into 2015 there we...\n", "86 yu nong gong rei lin kuo guang wu chen and ...\n", "Name: Text, dtype: object" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from IPython.display import display\n", "\n", "for predicted in category_id_df.category_id:\n", " for actual in category_id_df.category_id:\n", " if predicted != actual and conf_mat[actual, predicted] >= 
2:\n", " print(\"'{}' predicted as '{}' : {} examples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", " display(train_data.loc[indices_test[(y_test == actual) & (y_pred == predicted)]]['Text'])\n", " print('')" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier()" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model2 = RandomForestClassifier()\n", "model2.fit(features, labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "TensorFlow-GPU-1.13", "language": "python", "name": "tf-gpu" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }