Commit 3b43d4e2 authored by Chanelle Lee's avatar Chanelle Lee
Browse files

Added some more plots to analysis

parent 5360041c
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
......@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
......@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
......@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
......@@ -79,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
......@@ -91,16 +91,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x2140dc6d9b0>]"
"[<matplotlib.lines.Line2D at 0x97bb1d0>]"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
......@@ -123,16 +123,16 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x2140dd105f8>]"
"[<matplotlib.lines.Line2D at 0x9851c88>]"
]
},
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
......@@ -157,7 +157,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"metadata": {},
"outputs": [
{
......@@ -182,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"outputs": [
{
......@@ -207,7 +207,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"outputs": [
{
......@@ -232,7 +232,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -241,7 +241,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
......@@ -250,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 14,
"metadata": {},
"outputs": [
{
......@@ -335,7 +335,7 @@
"4 2.002012e-02 4.075678e-09 6.984412e-32 7.149700e-70 3.085665e-124"
]
},
"execution_count": 17,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
......@@ -353,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
......@@ -369,7 +369,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 16,
"metadata": {},
"outputs": [
{
......@@ -454,7 +454,7 @@
"78 0.390390 0.381372 0.273002 0.182563 0.113616"
]
},
"execution_count": 18,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
......@@ -484,7 +484,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 17,
"metadata": {},
"outputs": [
{
......@@ -569,7 +569,7 @@
"60 0.300300 0.347365 0.216260 0.119531 0.058234"
]
},
"execution_count": 19,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
......@@ -599,7 +599,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 18,
"metadata": {},
"outputs": [
{
......@@ -684,7 +684,7 @@
"41 0.205205 0.282880 0.125357 0.042451 0.010803"
]
},
"execution_count": 20,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
......@@ -714,7 +714,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 19,
"metadata": {},
"outputs": [
{
......@@ -799,7 +799,7 @@
"23 0.115115 0.152973 0.020303 0.001066 2.110231e-05"
]
},
"execution_count": 21,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
......@@ -822,16 +822,16 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x2140ed8b860>"
"<matplotlib.axes._subplots.AxesSubplot at 0x5394898>"
]
},
"execution_count": 22,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
},
......@@ -854,16 +854,16 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.lines.Line2D at 0x2140f13ad30>"
"<matplotlib.lines.Line2D at 0x9984320>"
]
},
"execution_count": 23,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
},
......@@ -893,7 +893,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
......@@ -907,7 +907,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 23,
"metadata": {},
"outputs": [
{
......@@ -983,7 +983,7 @@
"19 0.095095 0.107618 0.006595 0.000100 3.576083e-07"
]
},
"execution_count": 38,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
......@@ -994,16 +994,31 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 24,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\cl15753\\AppData\\Local\\Continuum\\anaconda2\\envs\\Py3\\lib\\site-packages\\pandas\\core\\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" sort=sort)\n"
]
}
],
"source": [
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': row[2], 'note': 'Prob_q2'} for row in df_1Ranges.itertuples()])"
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': row[2], 'chance': '10%', 'note': 'Prob_q2'} for row in df_1Ranges.itertuples()])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 25,
"metadata": {},
"outputs": [
{
......@@ -1015,7 +1030,7 @@
" 0.028578442773621412]"
]
},
"execution_count": 58,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1027,11 +1042,11 @@
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': alphas_1[i], 'note': 'mean'}\n",
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': alphas_1[i], 'chance': '10%', 'note': 'mean'}\n",
" for i, row in enumerate(df_1Ranges.itertuples())])"
]
},
......@@ -1044,7 +1059,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
......@@ -1058,7 +1073,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 28,
"metadata": {},
"outputs": [
{
......@@ -1134,7 +1149,7 @@
"35 0.175175 0.250550 0.089228 0.021781 0.003561"
]
},
"execution_count": 61,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1145,16 +1160,16 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.lines.Line2D at 0x2140f873a20>"
"<matplotlib.lines.Line2D at 0xb114470>"
]
},
"execution_count": 62,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
......@@ -1184,16 +1199,16 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': row[2], 'note': 'Prob_q2'} for row in df_25Ranges.itertuples()])"
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': row[2], 'chance': '25%', 'note': 'Prob_q2'} for row in df_25Ranges.itertuples()])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
......@@ -1202,17 +1217,17 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': alphas_25[i], 'note': 'mean'}\n",
"testParams_DF = testParams_DF.append([{'sigma': row[1], 'alpha': alphas_25[i], 'chance': '25%', 'note': 'mean'}\n",
" for i, row in enumerate(df_25Ranges.itertuples())])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 33,
"metadata": {
"scrolled": true
},
......@@ -1221,23 +1236,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
" sigma alpha note\n",
"0 0.370370 0.375167 Prob_q2\n",
"1 0.280280 0.337069 Prob_q2\n",
"2 0.185185 0.262259 Prob_q2\n",
"3 0.095095 0.107618 Prob_q2\n",
"0 0.370370 0.227216 mean\n",
"1 0.280280 0.171782 mean\n",
"2 0.185185 0.099345 mean\n",
"3 0.095095 0.028578 mean\n",
"0 0.700701 0.433217 Prob_q2\n",
"1 0.525526 0.411280 Prob_q2\n",
"2 0.350350 0.368292 Prob_q2\n",
"3 0.175175 0.250550 Prob_q2\n",
"0 0.700701 0.339747 mean\n",
"1 0.525526 0.293395 mean\n",
"2 0.350350 0.216131 mean\n",
"3 0.175175 0.091280 mean\n"
" alpha chance note sigma\n",
"0 0.375167 10.0 Prob_q2 0.370370\n",
"1 0.337069 10.0 Prob_q2 0.280280\n",
"2 0.262259 10.0 Prob_q2 0.185185\n",
"3 0.107618 10.0 Prob_q2 0.095095\n",
"0 0.227216 10.0 mean 0.370370\n",
"1 0.171782 10.0 mean 0.280280\n",
"2 0.099345 10.0 mean 0.185185\n",
"3 0.028578 10.0 mean 0.095095\n",
"0 0.433217 25.0 Prob_q2 0.700701\n",
"1 0.411280 25.0 Prob_q2 0.525526\n",
"2 0.368292 25.0 Prob_q2 0.350350\n",
"3 0.250550 25.0 Prob_q2 0.175175\n",
"0 0.339747 25.0 mean 0.700701\n",
"1 0.293395 25.0 mean 0.525526\n",
"2 0.216131 25.0 mean 0.350350\n",
"3 0.091280 25.0 mean 0.175175\n"
]
}
],
......@@ -1247,31 +1262,31 @@
},
{
"cell_type": "code",
"execution_count": 101,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sigma alpha note\n",
"paramSet \n",
"1 0.370370 0.375167 Prob_q2\n",
"2 0.280280 0.337069 Prob_q2\n",
"3 0.185185 0.262259 Prob_q2\n",
"4 0.095095 0.107618 Prob_q2\n",
"5 0.370370 0.227216 mean\n",
"6 0.280280 0.171782 mean\n",
"7 0.185185 0.099345 mean\n",
"8 0.095095 0.028578 mean\n",
"9 0.700701 0.433217 Prob_q2\n",
"10 0.525526 0.411280 Prob_q2\n",
"11 0.350350 0.368292 Prob_q2\n",
"12 0.175175 0.250550 Prob_q2\n",
"13 0.700701 0.339747 mean\n",
"14 0.525526 0.293395 mean\n",
"15 0.350350 0.216131 mean\n",
"16 0.175175 0.091280 mean\n"
" alpha chance note sigma\n",
"paramSet \n",
"1 0.375167 10.0 Prob_q2 0.370370\n",
"2 0.337069 10.0 Prob_q2 0.280280\n",
"3 0.262259 10.0 Prob_q2 0.185185\n",
"4 0.107618 10.0 Prob_q2 0.095095\n",
"5 0.227216 10.0 mean 0.370370\n",
"6 0.171782 10.0 mean 0.280280\n",
"7 0.099345 10.0 mean 0.185185\n",
"8 0.028578 10.0 mean 0.095095\n",
"9 0.433217 25.0 Prob_q2 0.700701\n",
"10 0.411280 25.0 Prob_q2 0.525526\n",
"11 0.368292 25.0 Prob_q2 0.350350\n",
"12 0.250550 25.0 Prob_q2 0.175175\n",
"13 0.339747 25.0 mean 0.700701\n",
"14 0.293395 25.0 mean 0.525526\n",
"15 0.216131 25.0 mean 0.350350\n",
"16 0.091280 25.0 mean 0.175175\n"
]
}
],
......@@ -1283,7 +1298,7 @@
},
{
"cell_type": "code",
"execution_count": 102,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
......@@ -1292,7 +1307,7 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 36,
"metadata": {},
"outputs": [
{
......@@ -1301,7 +1316,7 @@
"0.37037046296296294"
]
},
"execution_count": 103,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1312,7 +1327,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 37,
"metadata": {},
"outputs": [
{
......@@ -1321,7 +1336,7 @@
"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}"
]
},
"execution_count": 104,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1354,7 +1369,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
"version": "3.7.1"
}
},
"nbformat": 4,
......
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.integrate
import scipy.stats
import seaborn as sns
```
%% Cell type:markdown id: tags:
# Analysis of the probability of confusing qualities for different values of sigma
%% Cell type:markdown id: tags:
First, we need the pdf and cdf of the normal distributions that represent the noise around each quality value.
%% Cell type:code id: tags:
``` python
def f(x, mu, s):
    """Probability density of a normal distribution evaluated at ``x``.

    Parameters
    ----------
    x : point(s) at which to evaluate the density.
    mu : mean (``loc``) of the distribution.
    s : standard deviation (``scale``) of the distribution.

    Returns
    -------
    The value of ``scipy.stats.norm.pdf(x, loc=mu, scale=s)``.
    """
    # Needs the ``scipy.stats`` submodule to be imported explicitly;
    # a bare ``import scipy`` is not guaranteed to load it.
    return scipy.stats.norm.pdf(x, loc=mu, scale=s)
```
%% Cell type:code id: tags:
``` python
def F(x, mu, s):
    """Cumulative distribution function of a normal distribution at ``x``.

    Parameters
    ----------
    x : point(s) at which to evaluate the cdf.
    mu : mean (``loc``) of the distribution.
    s : standard deviation (``scale``) of the distribution.

    Returns
    -------
    The value of ``scipy.stats.norm.cdf(x, loc=mu, scale=s)``.
    """
    # Needs the ``scipy.stats`` submodule to be imported explicitly;
    # a bare ``import scipy`` is not guaranteed to load it.
    return scipy.stats.norm.cdf(x, loc=mu, scale=s)
```
%% Cell type:markdown id: tags:
Qualities are equally spaced in the open interval (0, 1): $q_i = i/(n+1)$ for $i = 1, \dots, n$.
%% Cell type:code id: tags:
``` python
# Number of quality levels.
n = 5
# Quality of level k is k/(n+1), so the n values are evenly spaced and lie
# strictly between 0 and 1.
qualities = dict((k, k / (n + 1)) for k in range(1, n + 1))
```
%% Cell type:code id: tags:
``` python
def integrand(x, i, j, s):
    """Product of the density of quality ``i`` and the cdf of quality ``j`` at ``x``.

    Both normal distributions share the standard deviation ``s`` and are
    centred on ``qualities[i]`` and ``qualities[j]`` respectively.
    """
    density_i = f(x, qualities[i], s)
    cumulative_j = F(x, qualities[j], s)
    return density_i * cumulative_j
def probConfusion(i, j, s):
    """Integrate ``integrand(x, i, j, s)`` over ``x``.

    Parameters
    ----------
    i, j : keys into ``qualities`` selecting the two quality levels.
    s : common standard deviation of the noise; must be positive.

    Returns
    -------
    The ``(value, abserr)`` tuple returned by ``scipy.integrate.quad``;
    callers take element ``[0]`` for the probability.

    Notes
    -----
    The integration range is a finite window of +/- 40 standard deviations
    around ``qualities[i]`` rather than (-inf, inf). The integrand is bounded
    by the density of quality ``i``, whose mass outside that window is far
    below double precision. With infinite limits, ``quad`` can place all of
    its sample points away from a narrow peak (small ``s``) and report ~0
    even when the true value is close to 1.

    NOTE(review): for independent draws with equal ``s`` the exact value
    should be ``Phi((qualities[i] - qualities[j]) / (sqrt(2) * s))``, which
    can serve as a cross-check.
    """
    centre = qualities[i]
    half_width = 40 * s
    return scipy.integrate.quad(
        integrand, centre - half_width, centre + half_width, args=(i, j, s)
    )
```
%% Cell type:code id: tags:
``` python
# Sweep sigma from just above zero (sigma must be positive) up to 100 and
# record the probability of confusing quality 1 with quality 2 at each value.
s0, sf = 0.0000001, 100
ss = np.linspace(s0, sf, num=1000)
confusions = list(map(lambda sigma: probConfusion(1, 2, sigma)[0], ss))
```
%% Cell type:code id: tags:
``` python
# Confusion probability of q1 vs q2 over the full sigma sweep.
plt.plot(ss, confusions)
plt.xlabel('sigma')
plt.ylabel('probability of confusing q1 with q2');  # ';' hides the Line2D repr
```
%%%% Output: execute_result
[<matplotlib.lines.Line2D at 0x2140dc6d9b0>]
[<matplotlib.lines.Line2D at 0x97bb1d0>]
%%%% Output: display_data
[Hidden Image Output]
%% Cell type:code id: tags:
``` python
# Zoom in: repeat the q1 vs q2 sweep for sigma in (0, 10].
ss = np.linspace(0.0000001, 10, 1000)
confusions = [probConfusion(1, 2, s)[0] for s in ss]
plt.plot(ss, confusions)
plt.xlabel('sigma')
plt.ylabel('probability of confusing q1 with q2');  # ';' hides the Line2D repr
```
%%%% Output: execute_result
[<matplotlib.lines.Line2D at 0x2140dd105f8>]
[<matplotlib.lines.Line2D at 0x9851c88>]
%%%% Output: display_data
[Hidden Image Output]
%% Cell type:code id: tags:
``` python
# Confusion probability of q1 against each of q2..q5 for sigma in (0, 10].
ss = np.linspace(0.0000001, 10, 1000)
confusions = [[probConfusion(1, j, s)[0] for s in ss] for j in [2, 3, 4, 5]]
# One labelled curve per compared quality so the lines can be told apart.
for j, confusion in zip([2, 3, 4, 5], confusions):
    plt.plot(ss, confusion, label='q{}'.format(j))
plt.xlabel('sigma')
plt.ylabel('probability of confusing q1 with qj')
plt.legend();  # ';' hides the Legend repr
```
%%%% Output: display_data
[Hidden Image Output]
%% Cell type:code id: tags:
``` python
# NOTE(review): this cell repeats the preceding sweep over (0, 10] with the
# same inputs; it recomputes every integral and could probably be removed.
ss = np.linspace(0.0000001, 10, 1000)
confusions = [[probConfusion(1, j, s)[0] for s in ss] for j in [2, 3, 4, 5]]
# One labelled curve per compared quality so the lines can be told apart.
for j, confusion in zip([2, 3, 4, 5], confusions):
    plt.plot(ss, confusion, label='q{}'.format(j))
plt.xlabel('sigma')
plt.ylabel('probability of confusing q1 with qj')
plt.legend();  # ';' hides the Legend repr
```
%%%% Output: display_data
[Hidden Image Output]
%% Cell type:code id: tags:
``` python
# Final sweep used for the table below: q1 against q2..q5 for sigma in (0, 5].
ss = np.linspace(0.0000001, 5, 1000)
confusions = [[probConfusion(1, j, s)[0] for s in ss] for j in [2, 3, 4, 5]]
# One labelled curve per compared quality so the lines can be told apart.
for j, confusion in zip([2, 3, 4, 5], confusions):
    plt.plot(ss, confusion, label='q{}'.format(j))
plt.xlabel('sigma')
plt.ylabel('probability of confusing q1 with qj')
plt.legend();  # ';' hides the Legend repr
```
%%%% Output: display_data
[Hidden Image Output]
%% Cell type:code id: tags:
``` python
# Column-oriented data: the sigma grid, then one confusion curve per compared
# quality under the keys 'q2'..'q5'.
d = {'sigma': ss}
d.update({'q{}'.format(level): confusions[pos] for pos, level in enumerate([2, 3, 4, 5])})
```
%% Cell type:code id: tags:
``` python
# One row per sigma value, one column per compared quality.
df = pd.DataFrame(d)
```
%% Cell type:code id: tags:
``` python
# Preview the first five rows.
df.head(5)
```
%%%% Output: execute_result
sigma q2 q3 q4 q5
0 1.000000e-07 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
1 5.005105e-03 4.104751e-122 0.000000e+00 0.000000e+00 0.000000e+00
2 1.001011e-02 1.051201e-31 3.577518e-122 7.451264e-274 0.000000e+00
3 1.501511e-02 5.734691e-15 2.733803e-55 9.343211e-123 2.336542e-219
4 2.002012e-02 4.075678e-09 6.984412e-32 7.149700e-70 3.085665e-124
%% Cell type:markdown id: tags:
### To get an idea of appropriate sigma values, check the cut-offs at which there is at least a 10% chance of confusion for all sites within a given range:
%% Cell type:code id: tags:
``` python
# Collects the (sigma, alpha) parameter sets selected below. 'chance' is
# declared here because the rows appended later carry that key; without it the
# columns are misaligned on append, which makes pandas emit a FutureWarning
# and re-sort the columns alphabetically.
testParams_DF = pd.DataFrame(columns=['sigma', 'alpha', 'chance', 'note'])
```
%% Cell type:markdown id: tags:
Range is whole quality space
%% Cell type:code id: tags:
``` python
# First sigma values at which each of q2..q5 has a confusion probability of at least 0.1.
df[(df[['q2', 'q3', 'q4', 'q5']] >= 0.1).all(axis=1)].head()
```
%%%% Output: execute_result
sigma q2 q3 q4 q5
74 0.370370 0.375167 0.262259 0.169892 0.101546
75 0.375375 0.376777 0.265031 0.173131 0.104590
76 0.380380 0.378347 0.267745 0.176322 0.107618
77 0.385385 0.379878 0.270401 0.179466 0.110627
78 0.390390 0.381372 0.273002 0.182563 0.113616
%% Cell type:markdown id: tags:
$\sigma = 0.37$
- q2 : 0.38
- q3 : 0.26
- q4 : 0.17
- q5 : 0.10
%% Cell type:markdown id: tags:
Range is q4 or closer
%% Cell type:code id: tags:
``` python
# First sigma values at which each of q2..q4 has a confusion probability of at least 0.1.
df[(df[['q2', 'q3', 'q4']] >= 0.1).all(axis=1)].head()
```