Plotting lineage persistence with Bokeh

March 02, 2020

These plots can be useful for exploring trends in infectious disease outbreaks over time. In some recent work on bugs growing in hospital sinks, I used the one below to help show that sink drains are colonised by a handful of E. coli lineages, which occasionally overlap with infections seen in patients staying on the same wards. This is a small dataset, but the interactivity of Bokeh is proving useful for exploring a larger version of this dataset, where room for annotations is limited. The code below clusters a dataframe of SNP distances (produced here from a recombination adjusted SNP phylogeny), and uses a dataframe of sampling dates to produce a plot much like the one below.

Bokeh Plot

# Python 3.7 with pandas, bokeh, scipy and treeswift
import pandas as pd
import treeswift as ts

from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import fcluster, linkage

from bokeh.models import DatetimeTickFormatter, HoverTool, LabelSet
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.transform import factor_cmap, factor_mark
from bokeh.io import output_notebook, export_svgs
from bokeh.models.ranges import DataRange1d

wd = 'res/2020-03-02'  # working directory

output_notebook()

Load metadata

meta_df = pd.read_csv(f'{wd}/in/meta.tsv', sep='\t', index_col=0, parse_dates=True)
meta_df['date'] = pd.to_datetime(meta_df['date'])
meta_df.head()

	name	date	ward	sink	timepoint	type	mlst_unknown
guid
5b6f3b49-81d1-4bda-93e8-b8639d15e332	pi_ec_9446323_UC_7B_7B_26/03/2017	2017-03-26	GM	NaN	NaN	patient	ecoli 144
ecb5f5c6-eba4-45d8-b32b-9a03a88cec1c	pi_ec_1262305_UC_EAU_EAU_27/03/2017	2017-03-27	AA	NaN	NaN	patient	ecoli 127
fec51366-f5a9-4ad4-882d-1a16b711f460	pi_ec_1801612_UC_EAU_7C_01/04/2017	2017-04-01	AA	NaN	NaN	patient	ecoli 131
7e7500f3-f89c-4769-9054-00c705cf3168	pi_ec_1269762_UC_EAU_SSIP_14/04/2017	2017-04-14	AA	NaN	NaN	patient	ecoli 131
e69dd351-d716-41b2-bf6b-78e22b1cb461	pi_ec_1957587_UC_EAU_EAU_14/04/2017	2017-04-14	AA	NaN	NaN	patient	ecoli 73

Load core phylogeny & fetch recombination adjusted SNP distances

tree = (ts.read_tree_newick(f'{wd}/in/cluster_1_cf_scaled.guids.tree')
        .extract_tree_without({'reference'}))  # Get Tree without reference sequence
distances = tree.distance_matrix(leaf_labels=True)  # Pairwise distances as dict
distances_df = (pd.DataFrame(distances)   # Pairwise distances as square dataframe
                .sort_index(axis=0)
                .sort_index(axis=1)
                .fillna(0)
                .astype(int))

distances_df.iloc[:4,:4]

	02080367-872a-47fa-9412-a3eefb7cfb86	03297834-8d15-44e4-bd0d-38a0e234f8b5	03d85ba8-d254-4188-86bc-e97cf1f3c7c7	03ee2b1b-5658-4dad-a933-9bc457654c18
02080367-872a-47fa-9412-a3eefb7cfb86	0	26	67	21265
03297834-8d15-44e4-bd0d-38a0e234f8b5	26	0	47	21245
03d85ba8-d254-4188-86bc-e97cf1f3c7c7	67	47	0	21242
03ee2b1b-5658-4dad-a933-9bc457654c18	21265	21245	21242	0

Cluster isolates by SNP distance

snp_threshold = 100
distances_cnd = squareform(distances_df)
clusters = fcluster(linkage(distances_cnd, metric='precomputed'),
                    criterion='distance',
                    t=snp_threshold)
names_clusters = {g:c for g, c in zip(distances_df.columns, clusters)}

(Alternatively, one could use e.g. DBSCAN for clustering)

from sklearn.cluster import DBSCAN
clustering = (DBSCAN(eps=100, min_samples=1, metric='precomputed')
              .fit(distances_df.values))
clusters = clustering.labels_
names_clusters = {n: c for n, c in zip(distances_df.columns, clusters)}

Reorder clusters by date of appearance

clusters_meta_df = meta_df.copy()
clusters_meta_df['cluster'] = clusters_meta_df.index.map(names_clusters)
clusters_meta_df.sort_values('date', inplace=True)

clusters_newclusters = {}
cluster_appearance = [r['cluster'] for r in clusters_meta_df.to_dict('records')]
i = 0
for c in cluster_appearance:
    if c not in clusters_newclusters:
        clusters_newclusters[c] = i
        i += 1
clusters_meta_df['cluster_new'] = clusters_meta_df['cluster'].map(clusters_newclusters)

First try

p = figure()

p.scatter('date',
          'cluster_new',
          source=ColumnDataSource(clusters_meta_df),
          legend_group='ward',
          fill_alpha=0.25,
          line_width=1.5,
          size=9,
          marker=factor_mark('type', ('square', 'circle'), ('patient', 'sink')),
          color=factor_cmap('ward', 'Category10_7',
                            sorted(clusters_meta_df.ward.unique())))

p.xaxis.formatter = DatetimeTickFormatter(days = ['%Y-%m-%d'])

show(p)

Bokeh Plot

With cluster lifetimes

# Remove redundant clusters cultured from the same sink-timepoint
clusters_meta_sinks_nr_df = clusters_meta_df.query("type == 'sink'").drop_duplicates(['sink', 'cluster_new', 'date'])
clusters_meta_patients_df = clusters_meta_df.query("type == 'patient'")
clusters_meta_df = pd.concat([clusters_meta_sinks_nr_df, clusters_meta_patients_df])

clusters_startdates = {}
for r in clusters_meta_df.to_dict('records'):
    if r['cluster_new'] in clusters_startdates:
        if r['date'] < clusters_startdates[r['cluster_new']]:
            clusters_startdates[r['cluster_new']] = r['date']
    else:
        clusters_startdates[r['cluster_new']] = r['date']
clusters_enddates = {}
for r in clusters_meta_df.to_dict('records'):
    if r['cluster_new'] in clusters_enddates:
        if r['date'] > clusters_enddates[r['cluster_new']]:
            clusters_enddates[r['cluster_new']] = r['date']
    else:
        clusters_enddates[r['cluster_new']] = r['date']
clusters_lifetimes = {c: (sd, clusters_enddates[c])
                      for c, sd in clusters_startdates.items()}
clusters_lifetimes_len = {c: ed-sd for c, (sd, ed) in clusters_lifetimes.items()}
clusters_meta_df['cluster_lifetime'] = clusters_meta_df['cluster_new'].map(clusters_lifetimes_len)


p = figure()

p.scatter('date',
          'cluster_new',
          source=ColumnDataSource(clusters_meta_df),
          legend_group='ward',
          fill_alpha=0.25,
          line_width=1.5,
          size=9,
          marker=factor_mark('type', ('square', 'circle'), ('patient', 'sink')),
          color=factor_cmap('ward', 'Category10_7',
          sorted(clusters_meta_df.ward.unique())))

p.xaxis.formatter = DatetimeTickFormatter(days = ['%Y-%m-%d'])

for c, l in clusters_lifetimes.items():
    p.line(l, [c,c], line_width=1, line_color='gray')

show(p)

Bokeh Plot

With labels and tooltips

tooltips = [('Name', '@name'),
            ('Location', '@ward @sink'),
            ('Date', '@date{%F}'),
            ('Cluster', '@cluster (@mlst_unknown)')]

hover = HoverTool(names=['main'],  # Needed to format datetimes in tooltips
                  formatters={"date": "datetime"})

# Avoid clipping top-rightmost ST labels
x_range = DataRange1d(range_padding=0.2,
                      start=1488412800000,
                      range_padding_units='percent')

p = figure(plot_width=600, plot_height=700, x_range=x_range,
           title='E. coli lineage persistence',
           tooltips=tooltips,
           tools=[hover,'save'])

p.scatter('date',
          'cluster_new',
          source=ColumnDataSource(clusters_meta_df),
          name='main',
          legend_group='ward',
          fill_alpha=0.25,
          line_width=1.5,
          size=9,
          marker=factor_mark('type', ('square', 'circle'), ('patient', 'sink')),
          color=factor_cmap('ward', 'Category10_7',
                             sorted(clusters_meta_df.ward.unique())))

for c, l in clusters_lifetimes.items():
    p.line(l, [c,c], line_width=1, line_color='gray')

labels = LabelSet(x='date', y='cluster_new',
                  text='mlst_unknown',
                  level='glyph',
                  text_font_size="7pt",
                  x_offset=7, y_offset=-5,
                  source=(ColumnDataSource(clusters_meta_df.sort_values('date')
                          .drop_duplicates(['cluster_new'], keep='last'))))
p.add_layout(labels)

p.xaxis.formatter = DatetimeTickFormatter(days = ['%Y-%m-%d'])
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Core SNP cluster'
p.legend.location = "top_left"
p.xgrid.visible = False
p.ygrid.visible = False
p.toolbar.logo = None
p.toolbar_location = None

show(p)

Bokeh Plot

And we arrive where we began, excepting some ugly legend hacks which I’m too embarrassed to share. Let me know if you encounter issues, and please cite our paper if this helps your science! https://doi.org/10.1099/mgen.0.000391