Source code for hmis.selection

import numpy as np
from hmis.general import calc_age
from datetime import datetime

################################################################################
# Selects the individuals whose age falls within a given range.
################################################################################
def select_by_age(master_dictionary,lo=0, hi=1e9, date_to_calc_age=None):
    """ 
    This function returns the dictionaries of the individuals within the age range. 
    
    Args:
        **master_dictionary** (list): Full list of the dictionaries.

        **lo** (int): The lower bound (inclusive) of the targeted age range, in years.
        *Defaults to: 0*
        
        **hi** (int): The upper bound (inclusive) of the targeted age range, in years.
        *Defaults to: 1e9*

        **date_to_calc_age** (datetime.datetime or string): The date at which each
        individual's age is evaluated. A string is converted with get_date_from_string.
        *Defaults to: None, in which case the current date and time are used.*
        
    Returns: 
        **dictionary_subset** (list): The list of dictionaries of the individuals that are within the age range. 
        
    """
    
    # Put the date_to_calc_age into a datetime.datetime object
    if date_to_calc_age is None:
        date_to_calc_age = datetime.now()
    elif isinstance(date_to_calc_age, str):
        # NOTE(review): get_date_from_string is not imported at the top of this
        # module, so the string branch used to fail with a NameError. It is
        # presumably defined next to calc_age in hmis.general -- confirm.
        from hmis.general import get_date_from_string
        date_to_calc_age = get_date_from_string(date_to_calc_age)

    # Gets the personal IDs within the age range specified. 
    personal_IDs = []
    for ind in master_dictionary:
        age = calc_age(ind['DOB'], date_to_calc_age)
        # calc_age's result is read through .days; dividing by 365.0 gives
        # years as a float (leap days are not accounted for).
        age = age.days/365.0
        if lo <= age <= hi:
            personal_IDs.append(ind['Personal ID'])

    # np.unique removes duplicates and already returns the IDs sorted.
    personal_IDs = np.unique(personal_IDs)
    print("%d people have been selected." % (len(personal_IDs)))
    
    dictionary_subset = subset_from_dictionary(personal_IDs,master_dictionary)
    
    return dictionary_subset




################################################################################
# Gets the dictionaries that match the personal IDs passed in
################################################################################
def subset_from_dictionary(personal_IDs,full_dictionary,matching_key='Personal ID'):
    """ This function gets the subset of dictionaries from the personal IDs that are passed in.
    
    Args:
        **personal_IDs** (array): The IDs whose dictionaries should be returned.
        The ID values must be hashable.
        
        **full_dictionary** (list): The full list of dictionaries that has been made.
        
        **matching_key** (string): The key in each dictionary whose value is compared against the IDs. 
        *Defaults to: 'Personal ID'*
    
    Returns: 
        **inds** (list): One dictionary per matched ID, in the order of personal_IDs.
        If several dictionaries share an ID only the first one is returned, and
        IDs that match no dictionary are skipped. 
        
    """

    # Index the first dictionary seen for each ID once, so that every lookup
    # below is a single hash probe instead of a scan over the whole list.
    first_by_id = {}
    for client in full_dictionary:
        first_by_id.setdefault(client[matching_key], client)

    inds = [first_by_id[pid] for pid in personal_IDs if pid in first_by_id]

    return inds




# TODO: Ask the CARES folks whether this should be named by programs or projects.
def select_by_number_of_programs(master_dictionary, num_of_programs):
    """ 
    This function returns the dictionaries of the individuals that have at least the number of programs entered. 
    
    
    Args:
        **master_dictionary** (list): Full list of the dictionaries.
        
        **num_of_programs** (int): The minimum number of programs an individual must have to be returned.
        
    Returns: 
        **dictionary_subset** (list): The list of dictionaries of the individuals that have at least the number of programs inputted.
        
    """
    
    # Keep the ID of everyone whose 'Programs' list is long enough.
    personal_IDs = [ind['Personal ID'] for ind in master_dictionary
                    if len(ind['Programs']) >= num_of_programs]

    # np.unique removes duplicates and already returns the IDs sorted.
    personal_IDs = np.unique(personal_IDs)
    print((len(personal_IDs)))
    
    dictionary_subset = subset_from_dictionary(personal_IDs,master_dictionary)
    
    return dictionary_subset
    
    
    
def select_by_program_type(master_dictionary, prog_type):
    """ 
    This function returns the dictionaries of the individuals that have stayed at the inputted program type.
    
    
    Args:
        **master_dictionary** (list): Full list of the dictionaries.
        
        **prog_type** (str): The type of program that the individual must have stayed at.
        
    Returns: 
        **dictionary_subset** (list): The list of dictionaries of the individuals that have 
        at least one program whose 'Project type' equals prog_type.
        
    """
    
    # One matching program is enough; record each individual at most once
    # per dictionary instead of once per matching program.
    personal_IDs = [ind['Personal ID'] for ind in master_dictionary
                    if any(p['Project type'] == prog_type for p in ind['Programs'])]

    # np.unique removes duplicates and already returns the IDs sorted.
    personal_IDs = np.unique(personal_IDs)
    print((len(personal_IDs)))
    
    dictionary_subset = subset_from_dictionary(personal_IDs,master_dictionary)
    
    return dictionary_subset
    
    
################################################################################
# Get information from the original data
################################################################################
def get_additional_info(IDs,idtype='Personal',org_data=None,info=None):
    """ This function gets additional information on an individual,
    project, or an individual's entry into a project based on their PersonalID,
    ProjectID, or ProjectEntryID respectively.
    
    Args:
        **IDs** (list or string): The list of IDs as strings or a single ID.
        
        **idtype** (string): 'Personal' or 'Project' or 'ProjectEntry' which 
        tells the program what type of data to retrieve. 'Personal' searches
        the 'Client' file, 'ProjectEntry' the 'Enrollment' and 'Exit' files,
        and 'Project' the 'Site' and 'Project' files.
        *Defaults to: 'Personal'*

        **org_data**: (dictionary of Panda data frames) This is the output of the 
        read_in_data command. 

        **info** (list or string): This is a string or list of strings, where
        the strings are the headers of the Pandas dataframes and the information
        to be returned. 

    Return:
        **information** (dictionary) This is a dictionary with the keys representing
        the IDs passed in and the values are dictionaries with those keys being
        the different pieces of information passed in with the info variable. 
        Missing (NaN) entries are reported as the string "EMPTY". A header is
        left out for an ID that is not present in any file holding that header.
        None is returned, after printing an explanation, if idtype is not
        recognised, if org_data or info is not given, or if a requested header
        (or the ID column itself) is in none of the files searched.

     """

    # Error checking
    if idtype not in ('Personal', 'Project', 'ProjectEntry'):
        print("type must be \'Personal\' or \'Project\' or \'ProjectEntry\'!!!")
        print("Instead, idtype is %s" % (idtype))
        print("Returning from get_additional_info() without doing anything")
        return None

    if org_data is None:
        print("org_data must be passed in!")
        print("Instead, org_data is %s" % (org_data))
        print("This is the original data as returned by the read_in_data() function")
        print("Returning from get_additional_info() without doing anything")
        return None

    if info is None:
        print("info must be passed in!")
        print("Instead, info is %s" % (info))
        print("This should be a header or headers (as a list) for the original files.")
        print("Returning from get_additional_info() without doing anything")
        return None


    # Get the list of original .csv files from which we'll look for this info. 
    # We can add to this list later, if there is interest.
    files_by_idtype = {
        'Personal': ['Client'],
        'ProjectEntry': ['Enrollment', 'Exit'],
        'Project': ['Site', 'Project'],
    }
    list_of_files = files_by_idtype[idtype]
    idkey = "%sID" % (idtype)

    if isinstance(IDs, str):
        IDs = [IDs]

    if isinstance(info, str):
        info = [info]

    # Check that the info keys are actually in the headers, including the idkey
    # which will be PersonalID or ProjectID or ProjectEntryID.
    for header in list(info) + [idkey]:

        found_header = any(header in org_data[name].columns for name in list_of_files)

        if not found_header:
            print("%s not found in any of the headers in the files!" % (header))
            print("Returning from get_additional_info() without doing anything")
            return None

    values = {}

    for ID in IDs:

        # For the person or project
        values[ID] = {}

        # Locate this ID's row in every file once, rather than once per header.
        rows = []
        for name in list_of_files:
            filedata = org_data[name]

            # Only one of the files is guaranteed to carry the ID column.
            if idkey not in filedata.columns:
                continue

            matches = filedata[filedata[idkey] == ID]

            # Not in this file: keep looking in the remaining files.
            if len(matches) == 0:
                continue

            # We are going to assume that the ID only appears once!
            if len(matches) > 1:
                print("%s appears more than once in the %s file!!!" % (ID, name))
                print("Using only the first appearence, but this might not be right!")

            # Take the row by position within the matches, so the result does
            # not depend on what the dataframe's index labels happen to be.
            rows.append(matches.iloc[0])

        for header in info:

            # If several files hold this header, the last file listed wins.
            for row in rows:
                if header in row.index:
                    value = row[header]
                    # NaN is the only value that is not equal to itself.
                    if value != value:
                        value = "EMPTY"
                    values[ID][header] = value

    return values