Files committed to the personality detection project

2020-03-01 17:38:48 +11:00 · 2020-03-01 17:38:48 +11:00 · b7d348b735
parent 185f66c5db
commit b7d348b735
11 changed files with 386 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/PersonalityDetection/.DS_Store
+++ b/PersonalityDetection/.DS_Store
--- a/PersonalityDetection/demo/.ipynb_checkpoints/MBTI_demo-checkpoint.ipynb
+++ b/PersonalityDetection/demo/.ipynb_checkpoints/MBTI_demo-checkpoint.ipynb
@ -0,0 +1,193 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Myers-Briggs Type Indicator demo\n",
+    "\n",
+    "Intructions: Execute the two cells below to load the functions and then enter text in space provided to estimate the MBTI personality\n",
+    "\n",
+    "**Note: This notebook requires SpaCy and IPython widgets to be installed**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "from spacy.lang.en.stop_words import STOP_WORDS\n",
+    "import re\n",
+    "import pickle\n",
+    "import numpy as np\n",
+    "from ipywidgets import widgets, interact\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "plt.style.use(\"ggplot\")\n",
+    "\n",
+    "# python -m spacy download en_core_web_sm\n",
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "\n",
+    "def tokeniser(sentence):\n",
+    " \n",
+    "    # Remove ||| from kaggle dataset\n",
+    "    sentence = re.sub(\"[]|||[]\", \" \", sentence)\n",
+    "\n",
+    "    # remove reddit subreddit urls\n",
+    "    sentence = re.sub(\"/r/[0-9A-Za-z]\", \"\", sentence)\n",
+    "\n",
+    "    # remove MBTI types\n",
+    "    MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',\n",
+    "              'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',\n",
+    "              'MBTI']\n",
+    "    MBTI_types = [ti.lower() for ti in MBTI_types] + [ti.lower() + 's' for ti in MBTI_types]\n",
+    "\n",
+    "    tokens = nlp(sentence)\n",
+    "\n",
+    "    tokens = [ti for ti in tokens if ti.lower_ not in STOP_WORDS]\n",
+    "    tokens = [ti for ti in tokens if not ti.is_space]\n",
+    "    tokens = [ti for ti in tokens if not ti.is_punct]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_num]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_url]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_email]\n",
+    "    tokens = [ti for ti in tokens if ti.lower_ not in MBTI_types]\n",
+    "\n",
+    "\n",
+    "    # lemmatize\n",
+    "    tokens = [ti.lemma_ for ti in tokens if ti.lemma_ not in STOP_WORDS]\n",
+    "    tokens = [ti for ti in tokens if len(ti) > 1]\n",
+    "\n",
+    "    return tokens\n",
+    "\n",
+    "dummy_fn = lambda x:x\n",
+    "\n",
+    "\n",
+    "with open('./pickle files/cv.pickle', 'rb') as f:\n",
+    "    cv = pickle.load(f)\n",
+    "    \n",
+    "with open('./pickle files/idf_transformer.pickle', 'rb') as f:\n",
+    "    idf_transformer = pickle.load(f)\n",
+    "    \n",
+    "# loading the pickle files with the classifiers\n",
+    "with open('./pickle files/LR_clf_IE_kaggle.pickle', 'rb') as f:\n",
+    "    lr_ie = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_JP_kaggle.pickle', 'rb') as f:\n",
+    "    lr_jp = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_NS_kaggle.pickle', 'rb') as f:\n",
+    "    lr_ns = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_TF_kaggle.pickle', 'rb') as f:\n",
+    "    lr_tf = pickle.load(f)\n",
+    "\n",
+    "\n",
+    "def eval_string(my_post, show_graph=False):\n",
+    "    c = cv.transform([tokeniser(my_post)])\n",
+    "    x = idf_transformer.transform(c)\n",
+    "    \n",
+    "    ie = lr_ie.predict_proba(x).flatten()\n",
+    "    ns = lr_ns.predict_proba(x).flatten()\n",
+    "    tf = lr_tf.predict_proba(x).flatten()\n",
+    "    jp = lr_jp.predict_proba(x).flatten()\n",
+    "    \n",
+    "    probs = np.vstack([ie, ns, tf, jp])\n",
+    "    \n",
+    "    names = [\"Introversion - Extroversion\", \n",
+    "             \"Intuiting - Sensing\", \n",
+    "             \"Thinking - Feeling\", \n",
+    "             \"Judging - Perceiving\"]\n",
+    "    \n",
+    "    for i, dim in enumerate(names):\n",
+    "        print(f\"{dim:28s}: {probs[i,1]:.3f} - {probs[i, 0]:.3f}\")\n",
+    "        \n",
+    "    if show_graph:\n",
+    "        fig = plt.figure(figsize=(6,6))\n",
+    "        ax = fig.gca()\n",
+    "        \n",
+    "        xlabels = [\"Introversion (I)\", \"Intuiting (N)\", \"Thinking (T)\", \"Judging (J)\"]\n",
+    "        ax.barh(xlabels, [1, 1, 1, 1])\n",
+    "        ax.barh(xlabels, [ie[1], ns[1], tf[1], jp[1]])\n",
+    "        \n",
+    "        ax.set_xlim([0, 1])\n",
+    "        ax.set_xlabel(\"Propensity\")\n",
+    "        \n",
+    "        plt.show(fig)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Type in some text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "942446e84f5f4ee0945435bd80296d2c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "interactive(children=(Textarea(value='', description='Input:', placeholder='Enter in some text'), Checkbox(val…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<function __main__.eval_string(my_post, show_graph=False)>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "interact(eval_string, my_post=widgets.Textarea( value='', \n",
+    "                                               placeholder='Enter in some text', \n",
+    "                                               description='Input:',\n",
+    "                                               disabled=False)\n",
+    "        )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/PersonalityDetection/demo/MBTI_demo.ipynb
+++ b/PersonalityDetection/demo/MBTI_demo.ipynb
@ -0,0 +1,193 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Myers-Briggs Type Indicator demo\n",
+    "\n",
+    "Intructions: Execute the two cells below to load the functions and then enter text in space provided to estimate the MBTI personality\n",
+    "\n",
+    "**Note: This notebook requires SpaCy and IPython widgets to be installed**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "from spacy.lang.en.stop_words import STOP_WORDS\n",
+    "import re\n",
+    "import pickle\n",
+    "import numpy as np\n",
+    "from ipywidgets import widgets, interact\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "plt.style.use(\"ggplot\")\n",
+    "\n",
+    "# python -m spacy download en_core_web_sm\n",
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "\n",
+    "def tokeniser(sentence):\n",
+    " \n",
+    "    # Remove ||| from kaggle dataset\n",
+    "    sentence = re.sub(\"[]|||[]\", \" \", sentence)\n",
+    "\n",
+    "    # remove reddit subreddit urls\n",
+    "    sentence = re.sub(\"/r/[0-9A-Za-z]\", \"\", sentence)\n",
+    "\n",
+    "    # remove MBTI types\n",
+    "    MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',\n",
+    "              'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',\n",
+    "              'MBTI']\n",
+    "    MBTI_types = [ti.lower() for ti in MBTI_types] + [ti.lower() + 's' for ti in MBTI_types]\n",
+    "\n",
+    "    tokens = nlp(sentence)\n",
+    "\n",
+    "    tokens = [ti for ti in tokens if ti.lower_ not in STOP_WORDS]\n",
+    "    tokens = [ti for ti in tokens if not ti.is_space]\n",
+    "    tokens = [ti for ti in tokens if not ti.is_punct]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_num]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_url]\n",
+    "    tokens = [ti for ti in tokens if not ti.like_email]\n",
+    "    tokens = [ti for ti in tokens if ti.lower_ not in MBTI_types]\n",
+    "\n",
+    "\n",
+    "    # lemmatize\n",
+    "    tokens = [ti.lemma_ for ti in tokens if ti.lemma_ not in STOP_WORDS]\n",
+    "    tokens = [ti for ti in tokens if len(ti) > 1]\n",
+    "\n",
+    "    return tokens\n",
+    "\n",
+    "dummy_fn = lambda x:x\n",
+    "\n",
+    "\n",
+    "with open('./pickle files/cv.pickle', 'rb') as f:\n",
+    "    cv = pickle.load(f)\n",
+    "    \n",
+    "with open('./pickle files/idf_transformer.pickle', 'rb') as f:\n",
+    "    idf_transformer = pickle.load(f)\n",
+    "    \n",
+    "# loading the pickle files with the classifiers\n",
+    "with open('./pickle files/LR_clf_IE_kaggle.pickle', 'rb') as f:\n",
+    "    lr_ie = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_JP_kaggle.pickle', 'rb') as f:\n",
+    "    lr_jp = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_NS_kaggle.pickle', 'rb') as f:\n",
+    "    lr_ns = pickle.load(f)\n",
+    "with open('./pickle files/LR_clf_TF_kaggle.pickle', 'rb') as f:\n",
+    "    lr_tf = pickle.load(f)\n",
+    "\n",
+    "\n",
+    "def eval_string(my_post, show_graph=False):\n",
+    "    c = cv.transform([tokeniser(my_post)])\n",
+    "    x = idf_transformer.transform(c)\n",
+    "    \n",
+    "    ie = lr_ie.predict_proba(x).flatten()\n",
+    "    ns = lr_ns.predict_proba(x).flatten()\n",
+    "    tf = lr_tf.predict_proba(x).flatten()\n",
+    "    jp = lr_jp.predict_proba(x).flatten()\n",
+    "    \n",
+    "    probs = np.vstack([ie, ns, tf, jp])\n",
+    "    \n",
+    "    names = [\"Introversion - Extroversion\", \n",
+    "             \"Intuiting - Sensing\", \n",
+    "             \"Thinking - Feeling\", \n",
+    "             \"Judging - Perceiving\"]\n",
+    "    \n",
+    "    for i, dim in enumerate(names):\n",
+    "        print(f\"{dim:28s}: {probs[i,1]:.3f} - {probs[i, 0]:.3f}\")\n",
+    "        \n",
+    "    if show_graph:\n",
+    "        fig = plt.figure(figsize=(6,6))\n",
+    "        ax = fig.gca()\n",
+    "        \n",
+    "        xlabels = [\"Introversion (I)\", \"Intuiting (N)\", \"Thinking (T)\", \"Judging (J)\"]\n",
+    "        ax.barh(xlabels, [1, 1, 1, 1])\n",
+    "        ax.barh(xlabels, [ie[1], ns[1], tf[1], jp[1]])\n",
+    "        \n",
+    "        ax.set_xlim([0, 1])\n",
+    "        ax.set_xlabel(\"Propensity\")\n",
+    "        \n",
+    "        plt.show(fig)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Type in some text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "942446e84f5f4ee0945435bd80296d2c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "interactive(children=(Textarea(value='', description='Input:', placeholder='Enter in some text'), Checkbox(val…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<function __main__.eval_string(my_post, show_graph=False)>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "interact(eval_string, my_post=widgets.Textarea( value='', \n",
+    "                                               placeholder='Enter in some text', \n",
+    "                                               description='Input:',\n",
+    "                                               disabled=False)\n",
+    "        )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle
--- a/PersonalityDetection/demo/pickle
+++ b/PersonalityDetection/demo/pickle