Browse Source

basic nw sequence alignment

Alexander Rose 5 years ago
parent
commit
d8b9a1a560

+ 0 - 0
src/mol-model/sequence/alignment.ts


+ 21 - 0
src/mol-model/sequence/alignment/_spec/alignment.spec.ts

@@ -0,0 +1,21 @@
+/**
+ * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { align } from '../alignment';
+
+describe('Alignment', () => {
+    it('basic', () => {
+        // 3PQR: Rhodopsin
+        const seq1 = 'MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA';
+        // 3SN6: Endolysin,Beta-2 adrenergic
+        const seq2 = 'DYKDDDDAENLYFQGNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYAADEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILTKTWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSG';
+        const { ali1, ali2, score } = align(seq1, seq2);
+
+        expect(ali1).toEqual('------------------------------------------------------------------------------------------------------------------------------------------------MNGTEGPNFYVPFSNKTGVVRSPFEA---PQYYLAEPWQFSM--LAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLH---GYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMS-NFRFGENHAIMGVAFTWVMA-LACAAPPLVGWSRYI-PEGMQC----SCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLV----------------FTVKEAAAQQQESATTQ----------KAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA');
+        expect(ali2).toEqual('DYKDDDDAENLYFQGNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKS-RWYNQTPNRAKRVITTFRTGTWDAYAADEVWVVGMGIVMSLIVLAIVFG---NVLVITAIAKFERLQTVTNYFITSLACADLVM---GLAVVPFGAAHILTKTWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETC-CDFFT------NQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMG-TFTLCWLPFF-IVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIY-CRSPDFRIAFQELLCLRRSSL--KAYGNGYSSNGNTGEQSG');
+        expect(score).toEqual(118);
+    });
+});

+ 219 - 0
src/mol-model/sequence/alignment/alignment.ts

@@ -0,0 +1,219 @@
+/**
+ * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { SubstitutionMatrix, SubstitutionMatrices, SubstitutionMatrixData } from './substitution-matrix';
+
+const DefaultAlignmentOptions = {
+    gapPenalty: -11,
+    gapExtensionPenalty: -1,
+    substMatrix: 'blosum62' as SubstitutionMatrix
+};
+export type AlignmentOptions = typeof DefaultAlignmentOptions;
+
+export function align(seq1: string, seq2: string, options: Partial<AlignmentOptions> = {}) {
+    const o = { ...DefaultAlignmentOptions, ...options };
+    const alignment = new Alignment(seq1, seq2, o);
+    alignment.calculate();
+    alignment.trace();
+    return {
+        ali1: alignment.ali1,
+        ali2: alignment.ali2,
+        score: alignment.score,
+    };
+}
+
+class Alignment {
+    gapPenalty: number; gapExtensionPenalty: number
+    substMatrix: SubstitutionMatrixData
+
+    n: number; m: number
+    S: number[][]; V: number[][]; H: number[][]
+    ali1: string; ali2: string;
+    score: number
+
+    constructor (readonly seq1: string, readonly seq2: string, options: AlignmentOptions) {
+        this.gapPenalty = options.gapPenalty;
+        this.gapExtensionPenalty = options.gapExtensionPenalty;
+        this.substMatrix = SubstitutionMatrices[options.substMatrix];
+
+        this.n = this.seq1.length;
+        this.m = this.seq2.length;
+    }
+
+    private initMatrices () {
+        this.S = [];
+        this.V = [];
+        this.H = [];
+
+        for (let i = 0; i <= this.n; ++i) {
+            this.S[i] = [];
+            this.V[i] = [];
+            this.H[i] = [];
+
+            for (let j = 0; j <= this.m; ++j) {
+                this.S[i][j] = 0;
+                this.V[i][j] = 0;
+                this.H[i][j] = 0;
+            }
+        }
+
+        for (let i = 0; i <= this.n; ++i) {
+            this.S[i][0] = this.gap(0);
+            this.H[i][0] = -Infinity;
+        }
+
+        for (let j = 0; j <= this.m; ++j) {
+            this.S[0][j] = this.gap(0);
+            this.V[0][j] = -Infinity;
+        }
+
+        this.S[0][0] = 0;
+    }
+
+    private gap (len: number) {
+        return this.gapPenalty + len * this.gapExtensionPenalty;
+    }
+
+    private makeScoreFn () {
+        const seq1 = this.seq1;
+        const seq2 = this.seq2;
+
+        const substMatrix = this.substMatrix;
+
+        if (substMatrix) {
+            return function score (i: number, j: number) {
+                const c1 = seq1[i];
+                const c2 = seq2[j];
+
+                try {
+                    return substMatrix[c1][c2];
+                } catch (e) {
+                    return -4;
+                }
+            };
+        } else {
+            return function scoreNoSubstMat (i: number, j: number) {
+                const c1 = seq1[i];
+                const c2 = seq2[j];
+
+                return c1 === c2 ? 5 : -3;
+            };
+        }
+    }
+
+    calculate () {
+        this.initMatrices();
+
+        const gap0 = this.gap(0);
+        const scoreFn = this.makeScoreFn();
+        const { V, H, S, n, m, gapExtensionPenalty } = this;
+
+        let Vi1, Si1, Vi, Hi, Si;
+        for (let i = 1; i <= n; ++i) {
+            Si1 = S[i - 1], Vi1 = V[i - 1];
+            Vi = V[i], Hi = H[i], Si = S[i];
+
+            for (let j = 1; j <= m; ++j) {
+                Vi[j] = Math.max(
+                    Si1[j] + gap0,
+                    Vi1[j] + gapExtensionPenalty
+                );
+
+                Hi[j] = Math.max(
+                    Si[j - 1] + gap0,
+                    Hi[j - 1] + gapExtensionPenalty
+                );
+
+                Si[j] = Math.max(
+                    Si1[j - 1] + scoreFn(i - 1, j - 1), // match
+                    Vi[j], // del
+                    Hi[j]  // ins
+                );
+            }
+        }
+    }
+
+    trace () {
+        this.ali1 = '';
+        this.ali2 = '';
+
+        const scoreFn = this.makeScoreFn();
+
+        let i = this.n;
+        let j = this.m;
+        let mat: 'S' | 'V' | 'H';
+
+        if (this.S[i][j] >= this.V[i][j]) {
+            mat = 'S';
+            this.score = this.S[i][j];
+        } else if (this.V[i][j] >= this.H[i][j]) {
+            mat = 'V';
+            this.score = this.V[i][j];
+        } else {
+            mat = 'H';
+            this.score = this.H[i][j];
+        }
+
+        while (i > 0 && j > 0) {
+            if (mat === 'S') {
+                if (this.S[i][j] === this.S[i - 1][j - 1] + scoreFn(i - 1, j - 1)) {
+                    this.ali1 = this.seq1[i - 1] + this.ali1;
+                    this.ali2 = this.seq2[j - 1] + this.ali2;
+                    --i;
+                    --j;
+                    mat = 'S';
+                } else if (this.S[i][j] === this.V[i][j]) {
+                    mat = 'V';
+                } else if (this.S[i][j] === this.H[i][j]) {
+                    mat = 'H';
+                } else {
+                    --i;
+                    --j;
+                }
+            } else if (mat === 'V') {
+                if (this.V[i][j] === this.V[i - 1][j] + this.gapExtensionPenalty) {
+                    this.ali1 = this.seq1[i - 1] + this.ali1;
+                    this.ali2 = '-' + this.ali2;
+                    --i;
+                    mat = 'V';
+                } else if (this.V[i][j] === this.S[i - 1][j] + this.gap(0)) {
+                    this.ali1 = this.seq1[i - 1] + this.ali1;
+                    this.ali2 = '-' + this.ali2;
+                    --i;
+                    mat = 'S';
+                } else {
+                    --i;
+                }
+            } else if (mat === 'H') {
+                if (this.H[i][j] === this.H[i][j - 1] + this.gapExtensionPenalty) {
+                    this.ali1 = '-' + this.ali1;
+                    this.ali2 = this.seq2[j - 1] + this.ali2;
+                    --j;
+                    mat = 'H';
+                } else if (this.H[i][j] === this.S[i][j - 1] + this.gap(0)) {
+                    this.ali1 = '-' + this.ali1;
+                    this.ali2 = this.seq2[j - 1] + this.ali2;
+                    --j;
+                    mat = 'S';
+                } else {
+                    --j;
+                }
+            }
+        }
+
+        while (i > 0) {
+            this.ali1 = this.seq1[i - 1] + this.ali1;
+            this.ali2 = '-' + this.ali2;
+            --i;
+        }
+
+        while (j > 0) {
+            this.ali1 = '-' + this.ali1;
+            this.ali2 = this.seq2[j - 1] + this.ali2;
+            --j;
+        }
+    }
+}

+ 81 - 0
src/mol-model/sequence/alignment/substitution-matrix.ts

@@ -0,0 +1,81 @@
+/**
+ * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { Mutable } from '../../../mol-util/type-helpers';
+
+const aminoacidsX = 'ACDEFGHIKLMNPQRSTVWY';
+const aminoacids = 'ARNDCQEGHILKMFPSTWYVBZ?';
+
+const blosum62x = [
+    [4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2],        // A
+    [0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2],    // C
+    [-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3],       // D
+    [-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2],          // E
+    [-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3],        // F
+    [0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3],      // G
+    [-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2],        // H
+    [-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1],       // I
+    [-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2],        // K
+    [-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1],       // L
+    [-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1],        // M
+    [-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2],            // N
+    [-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3],   // P
+    [-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1],           // Q
+    [-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2],        // R
+    [1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2],           // S
+    [0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2],       // T
+    [0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1],        // V
+    [-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2],    // W
+    [-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7]       // Y
+];
+
+const blosum62 = [
+    // A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X
+    [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0], // A
+    [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1], // R
+    [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1], // N
+    [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1], // D
+    [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2], // C
+    [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1], // Q
+    [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1], // E
+    [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1], // G
+    [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1], // H
+    [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1], // I
+    [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1], // L
+    [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1], // K
+    [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1], // M
+    [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1], // F
+    [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2], // P
+    [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0], // S
+    [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0], // T
+    [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2], // W
+    [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1], // Y
+    [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1], // V
+    [-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1], // B
+    [-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1], // Z
+    [0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1]  // X
+];
+
+export type SubstitutionMatrixData = Readonly<{ [k: string]: Readonly<{ [k: string]: number }> }>;
+
+function prepareMatrix (cellNames: string, mat: number[][]): SubstitutionMatrixData {
+    let j: number;
+    let i = 0;
+    const matDict: Mutable<SubstitutionMatrixData> = {};
+    mat.forEach(row => {
+        j = 0;
+        const rowDict: { [k: string]: number } = {};
+        row.forEach(elm => rowDict[cellNames[j++]] = elm);
+        matDict[cellNames[i++]] = rowDict;
+    });
+    return matDict;
+}
+
+export const SubstitutionMatrices = (() => ({
+    blosum62: prepareMatrix(aminoacids, blosum62),
+    blosum62x: prepareMatrix(aminoacidsX, blosum62x)
+}))();
+export type SubstitutionMatrix = keyof typeof SubstitutionMatrices;

+ 0 - 1
src/mol-model/sequence/sequence.ts

@@ -9,7 +9,6 @@ import { AminoAlphabet, NuclecicAlphabet, getProteinOneLetterCode, getRnaOneLett
 import { Column } from '../../mol-data/db';
 
 // TODO add mapping support to other sequence spaces, e.g. uniprot
-// TODO sequence alignment (take NGL code as starting point)
 
 type Sequence = Sequence.Protein | Sequence.DNA | Sequence.RNA | Sequence.Generic