Formal Methods Project 2020-2021

Interrater Reliability

Author: Brian Woodcock

TASK: Prepare a dataset for the computation of interrater reliability for the classifications.

  1. [Explosion Step] For each classifier (a, b, c, d), explode their methods and subdisciplines classifications into indicator (1/0) columns. Thus, for each method type and subdiscipline type there will be four columns -- one per classifier. Since level is an ordinal variable, it is not exploded.

  2. [Reduction Step] For each method type and subdiscipline type, combine the four classifier columns from the previous step into two columns, keeping in each row only the two non-null entries. (Every row has exactly two classifications and two nulls, since each item was rated by exactly two classifiers.) The order in which the two kept entries are assigned to the new columns does not matter. The four level classification columns are reduced in the same way.
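The two steps above can be sketched in pandas. This is a minimal illustration, not the project's actual code: the column names (methods_a, logic, prob, etc.), the semicolon-delimited rating format, and the toy data are all assumptions.

```python
import pandas as pd

# Toy stand-in for the master dataset: each row is one item, rated by
# exactly two of the four classifiers (a, b, c, d).  Column names and the
# ";"-delimited multi-label format are assumptions for illustration.
df = pd.DataFrame({
    "methods_a": ["logic;prob", None, "logic"],
    "methods_b": ["logic", "prob", None],
    "methods_c": [None, "prob;logic", None],
    "methods_d": [None, None, "logic;prob"],
})

method_types = ["logic", "prob"]      # hypothetical method types
classifiers = ["a", "b", "c", "d"]

# Explosion step: one 1/0 indicator column per (method type, classifier),
# null where that classifier did not rate the row.
for m in method_types:
    for c in classifiers:
        df[f"{m}_{c}"] = df[f"methods_{c}"].map(
            lambda s: None if s is None else int(m in s.split(";"))
        )

# Reduction step: for each method type, keep the two non-null ratings per
# row (order is irrelevant) as columns <type>_1 and <type>_2.
for m in method_types:
    ratings = df[[f"{m}_{c}" for c in classifiers]]
    kept = ratings.apply(lambda row: row.dropna().tolist(), axis=1)
    df[f"{m}_1"] = kept.map(lambda xs: xs[0])
    df[f"{m}_2"] = kept.map(lambda xs: xs[1])
```

The same reduction loop applies unchanged to the four level columns, since it only relies on each row having exactly two non-null entries.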

Functions

Load Master Dataset

Create Hits DataFrame

Create Two Columns of Classifier Method Ratings for Each Method_Type

Create Two Columns of Classifier Level Ratings

Create Two Columns of Classifier Subdiscipline Ratings for Each Subdiscipline Type

Save

Preparation for Interrater Reliability

Add columns for logfam_1, logfam_2, probfam_1, probfam_2

Calculate Interrater Reliabilities -- Percent Agreement
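Percent agreement for one category is just the fraction of rows on which the two rating columns match. A minimal sketch, with illustrative ratings standing in for a pair of reduced columns:

```python
# Two columns of ratings for one category, as produced by the reduction
# step (values here are illustrative, not project data).
r1 = [1, 0, 1, 1, 0]
r2 = [1, 1, 1, 0, 0]

# Percent agreement: share of rows where both classifiers agree.
agreement = sum(a == b for a, b in zip(r1, r2)) / len(r1)
```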

Calculate Interrater Reliabilities -- Cohen's Kappa from scikit-learn

Calculate Interrater Reliabilities -- Cohen's Kappa from Formula
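Computing Cohen's kappa directly from the formula gives a check on the scikit-learn result (`sklearn.metrics.cohen_kappa_score` returns the same value on paired rating lists). The formula is kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed agreement and p_e the chance agreement implied by each rater's marginal label frequencies. The ratings below are illustrative:

```python
from collections import Counter

# Two classifiers' ratings for one category (illustrative data).
r1 = [1, 0, 1, 1, 0, 1]
r2 = [1, 1, 1, 0, 0, 1]
n = len(r1)

# Observed agreement p_o: fraction of rows where the raters agree.
p_o = sum(a == b for a, b in zip(r1, r2)) / n

# Chance agreement p_e: for each label, the product of the two raters'
# marginal proportions, summed over labels.
c1, c2 = Counter(r1), Counter(r2)
p_e = sum((c1[k] / n) * (c2[k] / n) for k in set(r1) | set(r2))

kappa = (p_o - p_e) / (1 - p_e)
```

Here p_o = 4/6 and p_e = 5/9, giving kappa = 0.25 -- agreement only modestly above chance despite 67% raw agreement, which is why kappa is reported alongside percent agreement.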

Interrater Reliabilities Table

Reference:

Agreement Table