Source code for violin.numeric

"""
numeric.py

Handles the element finding and comparison functions for VIOLIN
Created November 2019 - Casey Hansen MeLoDy Lab
Updated June 2025 - Haomiao Luo
"""

import pandas as pd
from typing import Union, List


# Placeholder strings that mean "no value" in a model cell (matched case-insensitively).
_MISSING_TOKENS = ('none', 'nan', '')

# Attribute names that get_attributes() knows how to extract.
_ALLOWED_ATTRIBUTES = (
    'Regulator Compartment', 'Regulator Compartment ID',
    'Regulated Compartment', 'Regulated Compartment ID',
    'Mechanism', 'Site',
    'Cell Line', 'Cell Type', 'Tissue Type', 'Organism',
)


def _is_missing(value) -> bool:
    """Return True when a cell holds a placeholder string or a null value."""
    if isinstance(value, str):
        return value.strip().lower() in _MISSING_TOKENS
    return bool(pd.isna(value))


def _value_or_nan(value):
    """Return the cell value unchanged, or the string 'nan' when it is missing."""
    return 'nan' if _is_missing(value) else value


def get_attributes(A_idx: int,
                   B_idx: int,
                   sign: str,
                   model_df: pd.DataFrame,
                   attrs: list,
                   path: bool=False) -> dict:
    """
    Collect the attributes of a model interaction in which element B regulates element A.

    Available attributes are [Regulator Compartment, Regulator Compartment ID,
    Regulated Compartment, Regulated Compartment ID, Mechanism, Site,
    Cell Line, Cell Type, Tissue Type, Organism].
    If Regulator Compartment is selected, Regulator Compartment ID is also returned
    (likewise for Regulated Compartment / Regulated Compartment ID).

    Parameters
    ----------
    A_idx: int
        Row index of element A (the regulated element) in the model dataframe.
    B_idx: int
        Row index of element B (the regulator) in the model dataframe.
    sign: str
        Sign of the interaction, either 'Positive' or 'Negative'. It selects the
        '<sign> Regulator List', '<sign> Mechanism List' and '<sign> Site List' columns.
        Ignored when ``path`` is True.
    model_df: pd.DataFrame
        A DataFrame of a model with BioRECIPE format.
    attrs: list
        Names of the attributes to extract.
    path: bool
        True when only a path (not a direct interaction) was found in the model.
        Mechanism and Site are then left as 'nan'.

    Returns
    -------
    model_attrs: dict
        Maps every requested attribute (plus the compartment IDs that accompany a
        requested compartment) to its model value, or to the string 'nan' when the
        model has no value for it.

    Raises
    ------
    ValueError
        If ``attrs`` contains an unsupported attribute name, or if element B is not
        listed among the ``sign`` regulators of element A.
    AssertionError
        If ``path`` is False and ``sign`` is neither 'Positive' nor 'Negative'.
    """
    unknown = [attr for attr in attrs if attr not in _ALLOWED_ATTRIBUTES]
    if unknown:
        raise ValueError(f'unsupported attributes {unknown}; acceptable attributes are: '
                         + ', '.join(_ALLOWED_ATTRIBUTES))

    model_attrs = {attr: 'nan' for attr in attrs}

    # Influence attributes: only meaningful for a direct interaction.
    if not path:
        if sign not in ('Positive', 'Negative'):
            # Raised explicitly instead of via `assert` so the check survives `python -O`;
            # the exception type is kept for existing callers.
            raise AssertionError(f"sign must be 'Positive' or 'Negative', got {sign!r}")

        # The per-regulator lists are comma separated and aligned with the regulator list,
        # so B's position among A's regulators selects its mechanism and site.
        regulators = [name.strip()
                      for name in model_df.loc[A_idx, f'{sign} Regulator List'].split(',')]
        regulator_name = model_df.loc[B_idx, 'Listname']
        if isinstance(regulator_name, str):
            regulator_name = regulator_name.strip()
        source_position = regulators.index(regulator_name)

        for a in ('Mechanism', 'Site'):
            if a not in attrs:
                continue
            cell = model_df.at[A_idx, f'{sign} {a} List']
            if _is_missing(cell):
                continue
            entries = cell.split(',')
            # A list shorter than the regulator list simply has no value for this regulator.
            if source_position < len(entries) and not _is_missing(entries[source_position]):
                model_attrs[a] = entries[source_position].strip()

    # Context attributes are read from the regulated element's row.
    for a in ('Cell Line', 'Cell Type', 'Tissue Type', 'Organism'):
        if a in attrs:
            value = model_df.at[A_idx, a]
            if not _is_missing(value):
                model_attrs[a] = value

    # Element attributes: requesting a compartment also reports its ID.
    if 'Regulated Compartment' in attrs:
        model_attrs['Regulated Compartment'] = _value_or_nan(model_df.loc[A_idx, 'Compartment'])
        model_attrs['Regulated Compartment ID'] = _value_or_nan(model_df.loc[A_idx, 'Compartment ID'])

    if 'Regulator Compartment' in attrs:
        model_attrs['Regulator Compartment'] = _value_or_nan(model_df.loc[B_idx, 'Compartment'])
        model_attrs['Regulator Compartment ID'] = _value_or_nan(model_df.loc[B_idx, 'Compartment ID'])

    return model_attrs
def find_element(search_type: str,
                 element_name: str,
                 element_type: str,
                 model_df: pd.DataFrame,
                 id_db: str=None
                 ) -> Union[List, int]:
    """
    Find the rows of the model that describe a given element of a given type.

    Because elements can exist as multiple types (protein, RNA, gene, etc.),
    the element name/ID is checked together with the element type.
    Several rows may be returned when an element of one type exists with
    varying attributes (such as different locations).

    Parameters
    ----------
    search_type : str
        An identifier kind of the element: 'hgnc', 'name', or 'id'.
        Any other value finds nothing.
    element_name: str
        A name (or ID) of the element being searched for. A cell matches when
        it contains this text as a substring. The placeholder 'nan' never matches.
    element_type: str
        A type of element ('protein', 'protein family', etc.); must equal the
        row's 'Element Type' exactly.
    model_df: pd.DataFrame
        A model dataframe within BioRECIPE format.
    id_db: str
        A database name for the provided identifier. Only used when
        ``search_type`` is 'id', where it must equal the row's 'Element Database'.

    Returns
    -------
    location : list|int
        Index labels of all model rows in which the element is found, usable
        with ``model_df.loc``; -1 if the element is not found.
    """
    search_columns = {
        "hgnc": 'Element HGNC Symbol',
        "name": 'Element Name',
        "id": 'Element IDs',
    }
    column = search_columns.get(search_type)

    # Unknown search types and the 'nan' placeholder can never match any row.
    if column is None or element_name == 'nan':
        return -1

    indices_list = []
    # Iterate over (index label, cell) so that every later .loc lookup addresses the
    # same row even when the dataframe does not carry a default 0..n-1 index.
    for label, cell in model_df[column].items():
        # Null / non-text cells cannot contain the searched name.
        if not isinstance(cell, str) or element_name not in cell:
            continue
        if search_type == "id" and id_db != model_df.loc[label, 'Element Database']:
            continue
        if model_df.loc[label, "Element Type"] == element_type:
            indices_list.append(label)

    # Value -1 means the element was not found.
    return indices_list if indices_list else -1
# Attribute names that describe the regulated (target) and regulator (source) location.
_REGULATED_LOCATION_KEYS = ('Regulated Compartment', 'Regulated Compartment ID')
_REGULATOR_LOCATION_KEYS = ('Regulator Compartment', 'Regulator Compartment ID')


def _compare_pair(model_value, reading_value) -> int:
    """Score one attribute pair: 0 same, 1 reading missing, 2 model missing, 3 different."""
    if model_value == reading_value:
        return 0
    if reading_value == "nan":
        return 1
    if model_value == "nan":
        return 2
    return 3


def compare(model_atts: dict, reading_atts: dict) -> int:
    """
    Compare the attributes of a model interaction with those of a reading (LEE) event.

    The outcome is one of

    - Attributes are the same (strong corroboration): 0
    - Some or all LEE attributes are missing (weak corroboration): 1
    - Some or all of the model attributes are missing (specification): 2
    - One or more model attribute differs from the LEE attributes (contradiction): 3

    Attributes are paired positionally: the i-th key of ``model_atts`` is compared
    with the i-th key of ``reading_atts`` and the reading key decides which kind of
    attribute it is, so both dictionaries are expected to list the same attributes
    in the same order. Extra trailing keys of the longer dictionary are ignored.

    A compartment and its ID describe the same location. When either of the two
    carries the same real (non-'nan') value in model and reading, the location is
    considered matched and the other member of the pair cannot lower the outcome.
    Two missing values do not count as such a match, so they never hide a
    difference in the other member.

    Parameters
    ----------
    model_atts : dict
        A dictionary of attributes for a model interaction; missing values are 'nan'.
    reading_atts : dict
        A dictionary of attributes for an event from an interactions list;
        missing values are 'nan'.

    Returns
    -------
    value : int
        The numerical representation of the comparison outcome. With no attributes
        to compare the result is 2.
    """
    other_outcomes = []
    regulator_outcomes, regulated_outcomes = [], []
    regulator_confirmed = regulated_confirmed = False

    # First, individually compare each attribute between the reading output and the model.
    for model_key, reading_key in zip(model_atts, reading_atts):
        model_value = model_atts[model_key]
        reading_value = reading_atts[reading_key]
        outcome = _compare_pair(model_value, reading_value)
        # Only an agreement on an actual value confirms a location.
        confirmed = outcome == 0 and model_value != "nan"

        if reading_key in _REGULATED_LOCATION_KEYS:
            regulated_outcomes.append(outcome)
            regulated_confirmed = regulated_confirmed or confirmed
        elif reading_key in _REGULATOR_LOCATION_KEYS:
            regulator_outcomes.append(outcome)
            regulator_confirmed = regulator_confirmed or confirmed
        else:
            other_outcomes.append(outcome)

    # A confirmed location overrides whatever its companion name/ID comparison produced.
    if regulator_confirmed:
        regulator_outcomes = [0] * len(regulator_outcomes)
    if regulated_confirmed:
        regulated_outcomes = [0] * len(regulated_outcomes)

    outcomes = set(other_outcomes + regulator_outcomes + regulated_outcomes)

    # Contradiction - some or all of the attributes differ
    if 3 in outcomes:
        value = 3
    # Strong corroboration - perfect match over all attributes
    elif outcomes == {0}:
        value = 0
    # Weak corroboration - attributes match or the model contains more information
    elif outcomes and outcomes <= {0, 1}:
        value = 1
    # Specification - the LEE contributes at least one attribute the model lacks
    # (also the result when there was nothing to compare)
    else:
        value = 2

    return value