"""
formatting.py
Handles the model and reading formatting functions for VIOLIN
Created November 2019 - Casey Hansen MeLoDy Lab
Updated May 2024 - Haomiao Luo
"""
from typing import Union, List
import logging
import re
import httplib2 as http
import json
import time
import pandas as pd
pd.options.mode.chained_assignment = None
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
# Default request headers: ask the remote service to answer with JSON.
headers = {
"Accept": 'application/json'
}
# Module-level HTTP client, created once at import time from the module imported as `http`.
h = http.Http()
# Body of a regex character class (to be placed inside ``[...]``) listing the
# characters that are allowed in variable names.
_VALID_CHARS = r'a-zA-Z0-9\_'

# Element types that are recognised when standardizing a type string.
# ('mutation' / 'geneticmutation' are intentionally not part of this list.)
# NOTE(review): 'chemicalfamily' appears in CANONICAL_CHEMICAL but not here - confirm intended.
CANONICAL_TYPES = [
    "protein",
    "proteinfamily",
    "proteincomplex",
    "proteinfamilyproteincomplex",
    # RNA
    "rna",
    "mrna",
    "microrna",
    "trna",
    # chemical
    "chemical",
    "simplechemical",
    "chemicalcompound",
    "compound",
    # biological process
    "biologicalprocess",
    "bioprocess",
]

# Spellings grouped by the standardized type they belong to.
CANONICAL_PROTEIN = [
    "protein",
    "proteinfamily",
    "proteincomplex",
    "proteinfamilyproteincomplex",
]
CANONICAL_CHEMICAL = [
    "chemical",
    "simplechemical",
    "chemicalcompound",
    "compound",
    "chemicalfamily",
]
CANONICAL_BIOPROCESS = [
    "biologicalprocess",
    "bioprocess",
]
CANONICAL_RNA = [
    "rna",
    "mrna",
    "microrna",
    "trna",
]

# Column headings used when formatting a model.
_VAR_COL = "Variable"
_IDX_COL = "#"

# Column headings listed as required for a model.
REQUIRED_MODEL = [
    "Element Name",
    "Element Type",
    "Element IDs",
    "Variable",
    "Positive Regulator List",
    "Negative Regulator List",
]

# Element type -> abbreviation used when building list names.
# NOTE(review): 'proteincomplex' shares the abbreviation 'pf' with 'proteinfamily' - confirm intended.
TYPE_ABBR_DICT = {
    "proteinfamily": "pf",
    "proteincomplex": "pf",
    "protein": "pn",
    "chemical": "che",
    "chemicalfamily": "cf",
    "gene": "gene",
    "rna": "rna",
    "mutation": "mut",
    "biologicalprocess": "bp",
}

# Subtype word -> three-letter abbreviation used when building list names.
SUBTYPE_ABBR_DICT = {
    "enzyme": "enz",
    "transporter": "tsp",
    "transcription-factor": "tsf",
    "transcription-repressor": "tsr",
    "transducer": "tsd",
    "kinase": "kin",
    "interferon": "ifn",
    "interleukin": "ilk",
    "subunit": "sub",
    "cytokine": "cyt",
    "tyrosine": "tyr",
    "receptor": "rec",
    "caspase": "cas",
    "phosphatase": "pho",
    "adaptor": "ada",
    "peptidase": "pep",
    "cyclin": "cyc",
    "growth-factor": "gwf",
    "binding": "bin",
    "molecule": "mol",
    "oncogene": "onc",
    "proto-oncogene": "pnc",
    "suppressor": "sup",
    "tumor": "tum",
    "signaling": "sig",
    "biological": "bio",
    "process": "prc",
    "protein": "prt",
    "redox": "red",
    "metallopeptidase": "mtp",
    "nonhistone": "nhs",
    "nucleoprotein": "ncp",
    "hormone": "hor",
    "ligase": "lgs",
    "ligand": "lgd",
    "regulator": "reg",
    "ubiquitin-protein": "ubp",
    "catalytic": "cat",
    "gtpase": "gtp",
    "reverse": "rvs",
    "transcriptase": "tst",
    "dehydrogenase": "dhy",
    "hydrogenase": "hyd",
    "peroxidase": "pox",
    "oxidase": "oxi",
    "glycoprotein": "glp",
    "necrosis-factor": "nec",
    "apoptosis": "apo",
    "active": "act",
}
def evidence_score(reading_df: pd.DataFrame, col_names: list) -> pd.DataFrame:
    """
    Merge duplicate interactions and calculate the evidence score of each interaction.

    Every cell is converted to a lower-case string before comparison, so rows that
    differ only by letter case are treated as the same interaction.

    Parameters
    ----------
    reading_df : pd.DataFrame
        A dataframe of the interaction list with BioRECIPE format.
    col_names : list
        A list of column headings used to determine if interactions are identical.

    Returns
    -------
    counted_reading : pd.DataFrame
        One row per unique combination of ``col_names``. Each remaining column holds the
        list of (lower-cased, stringified) values collected from the merged rows, and
        ``Evidence Score`` is the number of rows that were merged.
        When ``reading_df`` has no columns besides ``col_names`` the result contains only
        ``col_names`` and ``Evidence Score``.
    """
    # Convert reading to lower case, to prevent issues with case difference
    reading = reading_df.apply(lambda column: column.astype(str).str.lower())

    # The columns that aren't used to determine duplicates (such as Paper ID or Evidence Text)
    remainder = [column for column in reading_df.columns if column not in col_names]

    # Build the grouping once and reuse it for every aggregated column.
    grouped = reading.groupby(col_names)

    if not remainder:
        # Nothing to collect into lists: the score is simply the size of each group.
        return grouped.size().reset_index(name='Evidence Score')

    # As VIOLIN identifies duplicates, it merges duplicate attributes into a list
    first, *rest = remainder
    counted_reading = grouped[first].apply(list).reset_index(name=first)
    for column in rest:
        # Groups come out in the same order each time, so a positional index aligns the rows.
        counted_reading[column] = grouped[column].apply(list).reset_index(drop=True)

    # Counting the number of duplicates
    counted_reading['Evidence Score'] = counted_reading[first].str.len()
    return counted_reading
def add_regulator_names_id(model_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the model regulator lists from list names to the common element names and
    database identifiers.

    Each entry of 'Positive Regulator List' / 'Negative Regulator List' is a comma-separated
    string of regulators. Every regulator is looked up (after lower-casing) in the 'Listname'
    column, and the 'Element Name' and 'Element IDs' of the first matching row are collected.

    Parameters
    ----------
    model_df : pd.DataFrame
        The model dataframe (in BioRECIPE format). It must contain the columns
        'Listname', 'Element Name', 'Element IDs', 'Positive Regulator List' and
        'Negative Regulator List'. The input dataframe is not modified.

    Returns
    -------
    model_df : pd.DataFrame
        A new dataframe in which
        - the two regulator-list columns hold a list of lower-cased regulators
          (or the original lower-cased string when the entry is '' or 'nan'),
        - 'Positive Names', 'Positive IDs', 'Negative Names' and 'Negative IDs' hold the
          matching element names / IDs as lists, or the string 'nan' when there is no regulator.

    Raises
    ------
    ValueError
        If a regulator is not present in the 'Listname' column.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    model_df = model_df.copy()

    reg_col_list = ['Positive Regulator List', 'Negative Regulator List']
    # Keep object dtype so that a Python list can later be stored in a single cell.
    model_df[reg_col_list] = model_df[reg_col_list].apply(
        lambda column: column.astype(str).str.lower().astype(object)
    )

    # New (initially empty) columns for the regulator names and IDs
    for new_col in ('Positive Names', 'Positive IDs', 'Negative Names', 'Negative IDs'):
        model_df[new_col] = pd.Series(index=model_df.index, dtype=object)

    # Positional lookup tables, built once instead of once per regulator.
    element_names = model_df['Element Name'].tolist()
    element_ids = model_df['Element IDs'].tolist()
    first_row_of = {}
    for position, listname in enumerate(model_df['Listname']):
        # Keep the first occurrence, like list.index() would.
        first_row_of.setdefault(listname, position)

    # Convert Regulators
    for sign in ['Negative', 'Positive']:
        list_col = sign + ' Regulator List'
        names_col = sign + ' Names'
        ids_col = sign + ' IDs'
        for row in model_df.index:
            raw = model_df.at[row, list_col]
            if raw in ('', 'nan'):
                model_df.at[row, names_col] = 'nan'
                model_df.at[row, ids_col] = 'nan'
                continue

            # Drop every empty token (e.g. produced by a trailing comma).
            reg_var = [token for token in raw.split(',') if token != '']

            # Build the name/ID lists separately from reg_var: since there are multiple IDs
            # for each element, position i of every list refers to the same regulator.
            reg_name = []
            reg_id = []
            for listname in reg_var:
                if listname not in first_row_of:
                    raise ValueError(
                        "Regulator '{}' in column '{}' was not found in the 'Listname' column".format(
                            listname, list_col
                        )
                    )
                position = first_row_of[listname]
                reg_name.append(element_names[position])
                reg_id.append(element_ids[position])

            model_df.at[row, list_col] = reg_var
            model_df.at[row, names_col] = reg_name
            model_df.at[row, ids_col] = reg_id
    return model_df
def format_variable_names(model: pd.DataFrame) -> pd.DataFrame:
    """
    Format model variable names to make them compatible with model checking.

    A variable name is considered invalid when it starts with a digit or contains a
    character outside ``_VALID_CHARS``. For each invalid name:

    - invalid characters at the start of the name are removed,
    - every other run of invalid characters is replaced by a single ``_``,
    - ``ELE_`` is put in front of a name that (still) starts with digits.

    The old name is then replaced by the new one in every string cell of the model, so
    that references to the variable (e.g. in regulator lists) stay consistent. Only whole
    occurrences are replaced: an occurrence that is directly preceded or followed by a
    valid name character is part of a different identifier and is left alone.

    Parameters
    ----------
    model: pd.DataFrame
        A dataframe of model file. It is modified in place.

    Returns
    -------
    model: pd.DataFrame
        The same dataframe object, with standardized variable names.
    """
    invalid_chars = r'[^' + _VALID_CHARS + r']+'

    # remove whitespace in variable names
    model[_VAR_COL] = model[_VAR_COL].str.strip()

    # collect invalid element names (each one once), so they can be replaced everywhere in
    # the model; non-string cells such as NaN are skipped
    invalid_names = list(dict.fromkeys(
        name for name in model[_VAR_COL]
        if isinstance(name, str)
        and (re.search(r'^[0-9]', name) or re.search(invalid_chars, name))
    ))
    if not invalid_names:
        return model

    logging.info('Formatting variable names: ')

    # An occurrence only counts when it is not glued to other valid name characters.
    not_preceded = r'(?<![' + _VALID_CHARS + r'])'
    not_followed = r'(?![' + _VALID_CHARS + r'])'

    # Longest names first, so a name that contains a shorter invalid name is renamed as a
    # whole before the shorter one is processed.
    for invalid_name in sorted(invalid_names, key=len, reverse=True):
        # remove invalid characters at the start of the variable name
        replace_name = re.sub(r'^' + invalid_chars, '', invalid_name)
        # replace invalid characters elsewhere in variable names
        replace_name = re.sub(invalid_chars, '_', replace_name)
        # add ELE_ at the beginning of names starting with numbers
        replace_name = re.sub(r'(^[0-9]+)', 'ELE_\\1', replace_name)

        logging.info('%s -> %s', invalid_name, replace_name)
        # Only the search pattern is escaped; the replacement contains valid characters
        # only and therefore needs no escaping.
        model.replace(
            not_preceded + re.escape(invalid_name) + not_followed,
            replace_name,
            regex=True,
            inplace=True,
        )
    return model
def get_type(input_type: str) -> str:
    """
    Standardize an element type.

    All non-letter characters are removed from the input and the remaining letters are
    lower-cased before the lookup, so e.g. 'Protein Family' and 'protein_family' are both
    read as 'proteinfamily'.

    Parameters
    ----------
    input_type: str
        The type of an element. Non-string values are converted with ``str()``; the
        value 'nan' (e.g. a stringified missing value) is mapped to 'other'.

    Returns
    -------
    str
        One of 'protein', 'chemical', 'bioprocess', 'rna' or 'other'. Only types listed
        in ``CANONICAL_TYPES`` are mapped to a specific group; everything else is 'other'.
    """
    text = str(input_type)
    if text == 'nan':
        return 'other'

    # Keep ASCII letters only. The range must be spelled 'A-Za-z': 'A-z' would also
    # match the punctuation characters that sit between 'Z' and 'a'.
    normalized = ''.join(re.findall(r'[A-Za-z]+', text)).lower()

    if normalized not in CANONICAL_TYPES:
        return 'other'

    groups = (
        ('protein', CANONICAL_PROTEIN),
        ('chemical', CANONICAL_CHEMICAL),
        ('bioprocess', CANONICAL_BIOPROCESS),
        ('rna', CANONICAL_RNA),
    )
    for canonical, members in groups:
        if normalized in members:
            return canonical
    # A canonical type that belongs to no group: never return None implicitly.
    return 'other'
def get_hgnc_symbol(hgnc_id: str, url: str = 'https://rest.genenames.org/fetch/hgnc_id') -> str:
    """
    Fetch the HGNC symbol that belongs to an HGNC identifier.

    The request is sent with the module-level HTTP client and headers. It is attempted up
    to 10 times, waiting one second between attempts, and each response is parsed on its
    own. Failures are logged instead of raised (best-effort lookup).

    Parameters
    ----------
    hgnc_id: str
        A string of the hgnc identifier to search.
    url: str
        The base url of the fetch endpoint; ``/<hgnc_id>`` is appended to it.

    Returns
    -------
    hgnc_symbol: str
        The ``symbol`` of the first returned record, or an empty string when the service
        answers successfully without any record or when every attempt fails.
    """
    request_url = url + f'/{hgnc_id}'
    max_attempts = 10

    for attempt in range(1, max_attempts + 1):
        try:
            response, content = h.request(request_url, 'GET', '', headers)
            if response['status'] == '200':
                # Parse the body of *this* response.
                docs = json.loads(content)['response']['docs']
                if not docs:
                    # Successful answer without a record: retrying cannot help.
                    return ''
                return docs[0]['symbol']
            logging.warning(
                'HGNC lookup for %s returned status %s (attempt %d of %d)',
                hgnc_id, response['status'], attempt, max_attempts
            )
        except Exception as err:  # best-effort lookup: log the problem and retry
            logging.warning(
                'HGNC lookup for %s failed (attempt %d of %d): %s',
                hgnc_id, attempt, max_attempts, err
            )
        if attempt < max_attempts:
            time.sleep(1)
    return ''
def _drop_weight(expression: str) -> str:
    """Return the part after '*' of a 'weight*name' expression, or the expression itself."""
    if '*' in expression:
        _weight, name = expression.split('*')
        return name
    return expression


def _split_necessary_pair(reg_element: str) -> tuple:
    """Split '{necessary}[enhancing]' into the text inside the braces and inside the brackets."""
    depth = 0
    for index, char in enumerate(reg_element):
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
        if depth == 0:
            # index is the brace that closes the leading '{'; the opening '[' is
            # expected directly after it, hence the offset of 2.
            return reg_element[1:index], reg_element[index + 2:-1]
    raise ValueError('Unbalanced braces in regulation rule: {}'.format(reg_element))


def _atom_regulators(reg_element: str) -> list:
    """Extract the regulator name(s) from an element without brackets or commas."""
    if reg_element[-1] == '^':
        # trailing '^' marker: keep everything before it
        return [reg_element[:-1]]
    if '&' in reg_element:
        # the first and last characters are dropped
        return [reg_element[1:-1]]
    if '*' in reg_element:
        # product: skip factors that start with a digit and factors without any name character
        return [
            factor for factor in reg_element.split('*')
            if not re.search(r'^[0-9]', factor) and re.search(r'[a-zA-Z0-9_!]+', factor)
        ]
    if reg_element[0] == '!':
        negated = reg_element[1:]
        if '~' in negated:
            _delay, reg_delay = negated.split('~')
            return [reg_delay]
        return [negated]
    if '=' in reg_element:
        # NOTE(review): the part AFTER '=' is what gets returned - confirm against the rule notation.
        _name, target_state = reg_element.split('=')
        return [target_state]
    if '~' in reg_element:
        _delay, state = reg_element.split('~')
        return [state]
    return [reg_element]


def get_element(reg_rule: Union[str, list], layer: int) -> list:
    """
    Parse a regulation rule and disentangle its symbol operators, converting the rule to a
    list of regulators.

    The rule is split either by '+' or by top-level commas (never both), and each part is
    handled according to its shape:

    - ``{...}``      : braces are removed (an optional ``weight*`` prefix is dropped) and the
                       content is parsed recursively; only allowed when ``layer`` is 0,
    - ``{...}[...]`` : a necessary pair; both contents are parsed recursively,
    - ``(...)``      : the content is split by top-level commas and parsed recursively,
    - anything else  : a single element, possibly decorated with ``^``, ``&``, ``*``, ``!``,
                       ``=`` or ``~``.

    Empty parts (for example produced by a doubled separator) are ignored.

    Parameters
    ----------
    reg_rule: str, list
        A regulation rule. for further info of regulation rule, please check:
        https://melody-biorecipe.readthedocs.io/en/latest/model_representation.html.
    layer: int
        A counter for the recursion depth; pass 0 for the outermost call.

    Returns
    -------
    regulator_list: list
        A list of regulators; empty when ``reg_rule`` is empty.

    Raises
    ------
    ValueError
        If the rule mixes commas with plus signs, ends with a plus sign, or contains a
        necessary pair with unbalanced braces.
    AssertionError
        If braces are used below the top level, or a single element contains a comma.
    """
    if not reg_rule:
        return []

    if '+' not in reg_rule:
        reg_list = split_comma_out_parentheses(reg_rule)
    elif ',' in reg_rule:
        raise ValueError(
            'Found mixed commas and plus sign in regulation function'
        )
    elif reg_rule[-1] == '+':
        raise ValueError(
            'Regulation rule is not correct'
        )
    else:
        reg_list = reg_rule.split('+')

    regulator_list = []
    for reg_element in reg_list:
        if not reg_element:
            continue
        first, last = reg_element[0], reg_element[-1]

        if first == '{' and last == '}':
            assert layer == 0, 'Braces are only allowed at the top level of a regulation rule'
            # Recurse on the content WITHOUT the braces; passing the braces along would
            # re-enter this branch with layer 1 and always fail the check above.
            regulator_list += get_element(_drop_weight(reg_element[1:-1]), 1)
        elif first == '{' and last == ']':
            # This is a necessary pair
            necessary_element, enhance_element = _split_necessary_pair(reg_element)
            regulator_list += get_element(_drop_weight(necessary_element), 1)
            regulator_list += get_element(_drop_weight(enhance_element), 1)
        elif first == '(' and last == ')':
            for group in split_comma_out_parentheses(reg_element[1:-1]):
                regulator_list += get_element(group, 1)
        else:
            assert ',' not in reg_element, 'Unexpected comma in regulator: {}'.format(reg_element)
            regulator_list += _atom_regulators(reg_element)
    return regulator_list
def split_comma_out_parentheses(reg_rule: str) -> list:
    """
    Split a rule by the commas that are outside of any brackets,
    e.g. '(A,B),(C,B)' -> ['(A,B)','(C,B)'].

    Round, curly and square brackets all count as nesting. Every top-level comma is a
    split point, including one in the last position ('A,' -> ['A', '']).

    Parameters
    ----------
    reg_rule: str
        A regulation rule.

    Returns
    -------
    reg_list: list
        A list of expressions that are separated by top-level commas; an empty list for
        an empty rule.
    """
    reg_list = []
    if not reg_rule:
        return reg_list

    depth = 0
    start = 0
    for index, char in enumerate(reg_rule):
        if char in ('(', '{', '['):
            depth += 1
        elif char in (')', '}', ']'):
            depth -= 1
        elif char == ',' and depth == 0:
            reg_list.append(reg_rule[start:index])
            start = index + 1
    # Whatever follows the last top-level comma is the final expression.
    reg_list.append(reg_rule[start:])
    return reg_list
def wrap_list_to_str(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Wrap the lists in the output dataframe to strings.

    Every list-like cell of the given columns is replaced by its items joined with ','.
    Cells that already are strings, and cells that are not iterable (such as missing
    values), are left unchanged.

    Parameters
    ----------
    df: pd.DataFrame
        An output dataframe. Its columns are replaced in place; any row index is supported.
    cols: list
        A list of column names to convert.

    Returns
    -------
    df: pd.DataFrame
        The same dataframe object with the converted columns.
    """
    def _join_cell(value):
        # A string is iterable too, but joining it would put a comma between its characters.
        if isinstance(value, (str, bytes)):
            return value
        try:
            items = iter(value)
        except TypeError:
            return value
        return ','.join(str(item) for item in items)

    for col in cols:
        df[col] = df[col].map(_join_cell)
    return df
def get_listname(idx: int, model_df: pd.DataFrame) -> str:
    """
    Build the list name of one model element from its attributes.

    The list name has the form
    ``{element_name}_{element_type}_{element_subtype}_{compartment_ID}`` where

    - the element type is replaced by its abbreviation from ``TYPE_ABBR_DICT`` when one
      exists (spaces are removed either way),
    - the subtype is encoded by ``get_subtype_abbr`` (first subtype only),
    - colons are removed from the compartment ID,
    - a missing attribute shows up as 'nan', because all attributes are converted to strings.

    Side effect: the columns 'Element Name', 'Element Type', 'Element Subtype' and
    'Compartment ID' of ``model_df`` are converted to lower-case strings in place, for
    every row, each time this function is called.

    Parameters
    ----------
    idx: int
        The index label (as used with ``.loc``) of the element's row in the model.
    model_df: pd.DataFrame
        A dataframe of a model.

    Returns
    -------
    listname: str
        A formatted name for regulator list column.
    """
    attribute_cols = ['Element Name', 'Element Type', 'Element Subtype', 'Compartment ID']
    lowered = model_df[attribute_cols].apply(lambda column: column.astype(str).str.lower())
    model_df[attribute_cols] = lowered

    # Abbreviate the type when an abbreviation is known, otherwise use it as it is.
    row_type = model_df.loc[idx, 'Element Type']
    if str(row_type).replace(' ', '') in TYPE_ABBR_DICT:
        type_part = TYPE_ABBR_DICT[row_type.replace(' ', '')]
    else:
        type_part = row_type.replace(' ', '')

    name_part = model_df.loc[idx, 'Element Name']
    subtype_part = get_subtype_abbr(model_df.loc[idx, 'Element Subtype'])
    compartment_part = model_df.loc[idx, 'Compartment ID'].replace(':', '')
    return f'{name_part}_{type_part}_{subtype_part}_{compartment_part}'
def get_subtype_abbr(subtype: str) -> str:
    """
    Encode a subtype into the identifier pattern used in the list name of an element.

    Only the first comma-separated subtype is used. Parentheses are treated as spaces, the
    subtype is split into words at spaces, every word with an entry in
    ``SUBTYPE_ABBR_DICT`` (case-insensitive) is replaced by its abbreviation, and the
    pieces are concatenated without a separator.

    Parameters
    ----------
    subtype: str
        A subtype of the element.

    Returns
    -------
    abbr: str
        The abbreviation of the first subtype, or 'nan' when the first subtype is '' or 'nan'.
    """
    # FIXME: Only get first subtype (TBD for the other subtypes)
    first_subtype = subtype.split(',')[0]
    if first_subtype in ('', 'nan'):
        return 'nan'

    pieces = []
    for word in first_subtype.replace('(', ' ').replace(')', ' ').split(' '):
        if not word:
            # consecutive spaces produce empty words
            continue
        key = word.lower()
        if key in SUBTYPE_ABBR_DICT:
            pieces.append(SUBTYPE_ABBR_DICT[key])
        else:
            pieces.append(word.strip())
    return ''.join(pieces)
# TODO: implement this functionality and integrate it with BioRECIPE
# TODO: implement this functionality and integrate it with BioRECIPE