loadfonts(device = "postscript", quiet = TRUE)

tie_adjustment <- function(ranks) {
  Table <- table(ranks)
  N <- length(ranks)
  return((N + 1) + sum((Table ^ 3 - Table) / (N * (N - 1))))

grouped_mann_whitney_test <- function(DataFrame,group_var,value_var,sample_var) {
  subset_df <- DataFrame %>% 
  colnames(subset_df) <- c("group","x","sample")
  Ns <- subset_df %>%
    group_by(group,sample) %>%
    summarise(n = length(x)) %>% 
  Ns_renamed <- Ns
  colnames(Ns_renamed) <- c("group",seq(1,ncol(Ns_renamed)-1))
  mannwhit_sum_of_ranks <- subset_df %>%
    group_by(group) %>%
    mutate(ranks = rank(x)) %>%
  n_ties <- mannwhit_sum_of_ranks %>%
    group_by(group) %>%
    summarise(tie_adj = tie_adjustment(ranks))
  colnames(n_ties) <- c("group",seq(1,ncol(n_ties)-1))
  mannwhit_sum_of_ranks <- mannwhit_sum_of_ranks %>%
    summarise(sum_of_ranks = sum(x) - length(x) * (1 + length(x)) / 2) %>%
  colnames(mannwhit_sum_of_ranks) <- c("group",seq(1,ncol(mannwhit_sum_of_ranks)-1))
  comb_sam <- colnames(mannwhit_sum_of_ranks)[2:ncol(mannwhit_sum_of_ranks)] %>%
    combn(2) %>% 
  output <- list()
  tie_adj <- n_ties[,2]
  for (i in 1:nrow(comb_sam)) {
    sam <- comb_sam[i,]
    U_values <- apply(mannwhit_sum_of_ranks[,sam],1,min)
    mu_U <- Ns_renamed[,sam[1]] * Ns_renamed[,sam[2]] / 2
    sigma_U <- sqrt((2 * mu_U / 12) * tie_adj)
    p_values <- unlist((U_values - mu_U)/sigma_U) %>%
      pnorm(log.p = T)
    output[[i]] <- c(colnames(Ns)[as.numeric(sam) + 1],p_values / log(10))}
  output <- output %>% = rbind) 
  colnames(output) <- c("Sample1","Sample2",
  output %>% 
    as.tibble() %>% gather("key","log10.pvalue",-Sample1,-Sample2) %>% 

COLORS <- c(HS = "#CAB0B0",
            `Interface Rim HS` = "#784444",
            `Interface Support HS` = "#CAB0B0",
            `Interface Core HS` = "#FFDEDE",
            NS = "salmon3",
            `Interface Rim NS` = "#80635E",
            `Interface Support NS` = "salmon3",
            `Interface Core NS` = "#FF8370",
            Core = "#2F2504",
            Interface = "#9DC4CC",
            `Interface Rim` = "#4F6367",
            `Interface Support` = "#9DC4CC",
            `Interface Core` = "#78E8FF",
            Surface = "#442F38")

res_names <- c("Arg","Lys","Asn","Asp",
               "Val","Ile") %>%

p_values <- list(
  classifier = list(),
  hs = list(),
  interface = list(),
  interface_hs = list()

theme_update(axis.text.x = element_text(face="bold", size=16, angle=90, hjust = 2,vjust = 0.5),
             axis.title.x = element_blank(),axis.title.y=element_blank(),
             axis.text.y = element_text(face="bold",  size=16),
             panel.grid.major.y = element_blank(),
             panel.grid.major.x = element_line(size = 1, 
                                          linetype = 'dotted',
                                          colour = "gray60"),
             panel.grid.minor = element_blank(),
             panel.border = element_blank(),
             panel.background = element_blank(),
             strip.text = element_blank(),
             text = element_text(family="Roboto Mono"),
             legend.text = element_text(size=16),
             legend.background = element_rect(linetype = "dotted",
                                         color = "gray60",
                                         size = 0.5,
                                         fill = "#CAB0B0"),
             legend.position = "top")

1 Scientific Context

Protein-protein complexes are fundamental for a variety of biological functions ranging from hormone-receptor interactions to innate immunity. One of the key features of a Protein-Protein Interface (PPI) is its rich and diverse energetic landscape, featuring sites of high importance for complex formation (Hot-Spots – HS), little importance (Null-Spots – NS) and, more recently, sites which are thought to be deleterious to protein formation, Cold-Spots (CS).

Alanine-Scanning Mutagenesis (ASM), which consists in the systematic mutagenesis of a PPI, is the typical method of choice to characterize a binding interface. ASM uses protein engineering to replace interfacial residues by alanine, a hydrophobic amino acid, and compares the difference in the binding free energy (ΔΔGbinding) between Wild-Type (WT) and mutant complex. HS have been defined as those residues whose mutation to alanine results in a binding free energy difference ΔΔGbinding >= 2.0 kcal/mol whereas NS as the ones with a ΔΔGbinding lower than 2.0 kcal/mol, and CS as the positions where three or more different substitutions lead to at least a 0.3 kcal/mol increase in the free binding energy. In the last few years, we published a HS detection method, SpotOn, with an exceptional performance in identifying HS on an independent test set (95% accuracy and 95% sensitivity).

This is an online notebook where you can find a detailed analysis of the used data in our publication Hot or not so Hot Spots? Using HOOKED, a data-driven approach, to improve the understanding of binding motifs in protein-protein interactions. We systematically characterized a large set of non-redundant PPI using the non-redundant PPI4DOCK database, providing a big-data perspective on PPIs while simultaneously predicting HS with a method with the highest reported accuracy. Our paper aimed to understand and characterize the HS and NS distribution within a few distinct groups of residues - protein core residues, surface residues and interface residues. Within the latter, we focused on the rim, support and core of the interface (as per (Levy et al.) and on HS and NS as predicted by SpotOn.

2 Loading and merging data from different sources

The final dataset comprises 1228 protein-protein complexes (1326 different interfaces, accounting with more than one interface by complexes) retrieved from the PPI4DOCK database for a total of 55113 interfacial residues.

Our analysis resulted from the combination of four different in house pipelines - the SpotOn pipeline for HS prediction, the MENSA pipeline for protein complex characterisation, a pipeline to calculate the solvent accessible surface area and fluctuations in both complex and modelled monomeric form, and a post hoc pipeline to calculate the number of neighboring residues and HS within 5-10 \(\overset{\circ}{\mathrm {A}}\). As such, all data coming from the different platforms was initially merged. More details can be found on our publication.