Browse Source

wip, combine-mmcif script

Alexander Rose 7 years ago
parent
commit
f1a4053cb4
1 changed files with 162 additions and 48 deletions
  1. 162 48
      src/apps/combine-mmcif/index.ts

+ 162 - 48
src/apps/combine-mmcif/index.ts

@@ -7,17 +7,20 @@
 import * as argparse from 'argparse'
 import * as util from 'util'
 import * as fs from 'fs'
+import * as zlib from 'zlib'
 import fetch from 'node-fetch'
 require('util.promisify').shim();
-const readFileAsync = util.promisify(fs.readFile);
+const readFile = util.promisify(fs.readFile);
+const writeFile = util.promisify(fs.writeFile);
 
-import { DatabaseCollection, Database, Table } from 'mol-data/db'
+import { Database, Table, DatabaseCollection } from 'mol-data/db'
 import CIF from 'mol-io/reader/cif'
 // import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd'
 import * as Encoder from 'mol-io/writer/cif'
 import Computation from 'mol-util/computation'
-import { mmCIF_Schema } from 'mol-io/reader/cif/schema/mmcif';
+import { mmCIF_Schema, mmCIF_Database } from 'mol-io/reader/cif/schema/mmcif';
 import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd';
+import { BIRD_Schema } from 'mol-io/reader/cif/schema/bird';
 // import { Table } from 'mol-io/reader/csv/data-model';
 
 // import { Model, Structure } from 'mol-model/structure'
@@ -30,7 +33,11 @@ export async function ensureAvailable(path: string, url: string) {
         if (!fs.existsSync(DATA_DIR)) {
             fs.mkdirSync(DATA_DIR);
         }
-        fs.writeFileSync(path, await data.text())
+        if (url.endsWith('.gz')) {
+            await writeFile(path, zlib.gunzipSync(await data.buffer()))
+        } else {
+            await writeFile(path, await data.text())
+        }
         console.log(`done downloading ${url}`)
     }
 }
@@ -38,30 +45,28 @@ export async function ensureAvailable(path: string, url: string) {
 export async function ensureDataAvailable() {
     await ensureAvailable(CCD_PATH, CCD_URL)
     await ensureAvailable(PVCD_PATH, PVCD_URL)
+    await ensureAvailable(BIRD_PATH, BIRD_URL)
 }
 
 function showProgress(tag: string, p: Computation.Progress) {
     console.log(`[${tag}] ${p.message} ${p.isIndeterminate ? '' : (p.current / p.max * 100).toFixed(2) + '% '}(${p.elapsedMs | 0}ms)`)
 }
 
-export async function readCCD(bcif = false) {
-    const parsed = await parseCif(await readFileAsync(CCD_PATH, 'utf8'))
-    const ccd: DatabaseCollection<CCD_Schema> = {}
-    for (const data of parsed.result.blocks) {
-        // console.log(data.header)
-        ccd[data.header] = CIF.schema.CCD(data)
-    }
-    return ccd;
+export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) {
+    const parsed = await parseCif(await readFile(path, 'utf8'))
+    return CIF.toDatabaseCollection(schema, parsed.result)
 }
 
-export async function readPVCD(bcif = false) {
-    const parsed = await parseCif(await readFileAsync(PVCD_PATH, 'utf8'))
-    const ccd: DatabaseCollection<CCD_Schema> = {}
-    for (const data of parsed.result.blocks) {
-        // console.log(data.header)
-        ccd[data.header] = CIF.schema.CCD(data)
-    }
-    return ccd;
+export async function readCCD() {
+    return readFileAsCollection(CCD_PATH, CCD_Schema)
+}
+
+export async function readPVCD() {
+    return readFileAsCollection(PVCD_PATH, CCD_Schema)
+}
+
+export async function readBIRD() {
+    return readFileAsCollection(BIRD_PATH, BIRD_Schema)
 }
 
 export async function getCCD() {
@@ -69,6 +74,11 @@ export async function getCCD() {
     return readPVCD()
 }
 
+export async function getBIRD() {
+    await ensureDataAvailable()
+    return readBIRD()
+}
+
 async function parseCif(data: string|Uint8Array) {
     const comp = CIF.parse(data)
     const ctx = Computation.observable({
@@ -97,53 +107,157 @@ export async function getPdb(pdb: string) {
     return CIF.schema.mmCIF(parsed.result.blocks[0])
 }
 
+type extraTables = {
+    chem_comp_bond: Table<mmCIF_Schema['chem_comp_bond']>,
+    pdbx_reference_entity_list: Table<mmCIF_Schema['pdbx_reference_entity_list']>,
+    pdbx_reference_entity_link: Table<mmCIF_Schema['pdbx_reference_entity_link']>,
+    pdbx_reference_entity_poly_link: Table<mmCIF_Schema['pdbx_reference_entity_poly_link']>,
+}
+type extraTablesLists = {
+    [k in keyof extraTables]: extraTables[k][]
+}
+
+export function getExtraTables(mmcif: mmCIF_Database, ccd: DatabaseCollection<CCD_Schema>, bird: DatabaseCollection<BIRD_Schema>) {
+    const extraTablesLists: extraTablesLists = {
+        chem_comp_bond: [],
+        pdbx_reference_entity_list: [],
+        pdbx_reference_entity_link: [],
+        pdbx_reference_entity_poly_link: []
+    }
+
+    for (let i = 0, n = mmcif.chem_comp._rowCount; i < n; ++i) {
+        const ccdId = mmcif.chem_comp.id.value(i);
+        if (ccdId in ccd) {
+            extraTablesLists.chem_comp_bond.push(ccd[ccdId].chem_comp_bond)
+        } else {
+            console.error(`ccdId ${ccdId} not found`)
+        }
+    }
+
+    for (let i = 0, n = mmcif.pdbx_molecule_features._rowCount; i < n; ++i) {
+        const birdId = mmcif.pdbx_molecule_features.prd_id.value(i);
+        if (birdId in bird) {
+            const e = bird[birdId]
+            extraTablesLists.pdbx_reference_entity_list.push(e.pdbx_reference_entity_list)
+            extraTablesLists.pdbx_reference_entity_link.push(e.pdbx_reference_entity_link)
+            extraTablesLists.pdbx_reference_entity_poly_link.push(e.pdbx_reference_entity_poly_link)
+        } else {
+            console.error(`birdId ${birdId} not found`)
+        }
+    }
+
+    const extraTables: extraTables = Object.assign({}, ...Object.keys(extraTablesLists).map(k => {
+        // TODO how to avoid type casting?
+        return { [k]: Table.concat((extraTablesLists as any)[k], (mmCIF_Schema as any)[k]) }
+    }))
+
+    return extraTables
+}
+
+type PartialStructConnRow = Partial<Table.Row<mmCIF_Schema['struct_conn']>>
+
+export function getStructConnValueOrder(value: any): mmCIF_Schema['struct_conn']['pdbx_value_order']['T'] {
+    return mmCIF_Schema.struct_conn.pdbx_value_order['T'].includes(value) ? value : 'sing'
+}
+
+export function getBirdBonds(mmcif: mmCIF_Database) {
+    const bonds: PartialStructConnRow[] = []
+
+    const mol = mmcif.pdbx_molecule
+    const molFeat = mmcif.pdbx_molecule_features
+    for (let i = 0, n = molFeat._rowCount; i < n; ++i) {
+        // console.log(Table.getRow(molFeat, i))
+        const instancesAsymIdList: { [k: number]: string[] } = {}
+        for (let j = 0, m = mol._rowCount; j < m; ++j) {
+            if (mol.prd_id.value(j) === molFeat.prd_id.value(i)) {
+                // console.log(Table.getRow(mol, j))
+                const instanceId = mol.instance_id.value(j)
+                if (instancesAsymIdList[instanceId] === undefined) {
+                    instancesAsymIdList[instanceId] = []
+                }
+                instancesAsymIdList[instanceId].push(mol.asym_id.value(j))
+            }
+        }
+        // console.log(instancesAsymIdList)
+        const entityLink = mmcif.pdbx_reference_entity_link
+        for (const instanceId of Object.keys(instancesAsymIdList)) {
+            const asymIdList = instancesAsymIdList[instanceId as any]
+            for (let j = 0, m = entityLink._rowCount; j < m; ++j) {
+                if (entityLink.prd_id.value(j) === molFeat.prd_id.value(i)) {
+                    // console.log(Table.getRow(entityLink, j))
+                    const link: PartialStructConnRow = {
+                        ptnr1_label_asym_id: asymIdList[ entityLink.component_1.value(j) - 1 ],
+                        ptnr1_label_atom_id: entityLink.atom_id_1.value(j),
+                        ptnr1_label_comp_id: entityLink.comp_id_1.value(j),
+                        ptnr1_label_seq_id: entityLink.entity_seq_num_1.value(j),
+
+                        ptnr2_label_asym_id: asymIdList[ entityLink.component_2.value(j) - 1 ],
+                        ptnr2_label_atom_id: entityLink.atom_id_2.value(j),
+                        ptnr2_label_comp_id: entityLink.comp_id_2.value(j),
+                        ptnr2_label_seq_id: entityLink.entity_seq_num_2.value(j),
+
+                        pdbx_value_order: getStructConnValueOrder(entityLink.value_order.value(j)),
+                    }
+                    // console.log(link)
+                    bonds.push(link)
+                }
+            }
+        }
+    }
+    return bonds
+}
+
+export function getCcdBonds(mmcif: mmCIF_Database) {
+    const bonds: PartialStructConnRow[] = []
+
+}
+
 async function run(pdb: string, out?: string) {
     const ccd = await getCCD()
-    // console.log(Object.keys(ccd).length)
+    const bird = await getBIRD()
 
     const mmcif = await getPdb(pdb)
     // console.log(mmcif.chem_comp.id.toArray())
 
-    // const chemCompBond_Schema = { chem_comp_bond: mmCIF_Schema.chem_comp_bond }
-    // type chemCompBond_Schema = typeof chemCompBond_Schema;
-    // type chemCompBond_Database = Database<chemCompBond_Schema>;
-    // interface chemCompBond_Database extends Database<chemCompBond_Schema> {}
+    for (const k of Object.keys(bird)) {
+        const entity = bird[k].pdbx_reference_entity_list
+        for (let i = 0, n = entity._rowCount; i < n; ++i) {
+            if (entity.ref_entity_id.value(i) !== entity.component_id.value(i).toString()) {
+                console.log(Table.getRow(entity, i))
+            }
+        }
 
-    const chemCompBondTables: Table<mmCIF_Schema['chem_comp_bond']>[] = []
+        const link = bird[k].pdbx_reference_entity_link
+        for (let i = 0, n = link._rowCount; i < n; ++i) {
+            if (link.value_order.value(i) !== 'sing') {
+                console.log(Table.getRow(link, i))
+            }
+        }
 
-    for (let i = 0, n = mmcif.chem_comp._rowCount; i < n; ++i) {
-        const ccdId = mmcif.chem_comp.id.value(i);
-        // console.log(ccdId)
-        if (ccdId in ccd) {
-            // console.log(`ccdId ${ccdId} has ${ccd[ccdId].chem_comp_atom._rowCount} atoms`)
-            // console.log(ccd[ccdId].chem_comp_bond.atom_id_1.toArray())
-            chemCompBondTables.push(ccd[ccdId].chem_comp_bond)
-        } else {
-            console.error(`ccdId ${ccdId} not found`)
+        const polyLink = bird[k].pdbx_reference_entity_poly_link
+        for (let i = 0, n = link._rowCount; i < n; ++i) {
+            if (polyLink.value_order.value(i) !== 'sing') {
+                console.log(Table.getRow(polyLink, i))
+            }
         }
     }
 
-    // for (const k of Object.keys(ccd)) {
-    //     console.log(k)
-    // }
+    const extraTables = getExtraTables(mmcif, ccd, bird)
+    const combinedMmcif = Database.ofTables('mmcif_combined', mmCIF_Schema, Object.assign({}, mmcif, extraTables))
+    // console.log(getEncodedCif(pdb, combinedMmcif))
+    // console.log(Database.getTablesAsRows(combinedMmcif))
 
-    // const combinedChemCompBonds = Database.ofTables('chemCompBond', chemCompBond_Schema, {
-    //     chem_comp_bond: Table.concat(chemCompBondTables, mmCIF_Schema.chem_comp_bond)
-    // })
-    // console.log('concat done')
-    // console.log(getEncodedCif('chemCompBond', combinedChemCompBonds))
-    const combinedMmcif = Database.ofTables('chemCompBond', mmCIF_Schema, Object.assign({},
-        mmcif,
-        { chem_comp_bond: Table.concat(chemCompBondTables, mmCIF_Schema.chem_comp_bond) }
-    ))
-    console.log(getEncodedCif(pdb, combinedMmcif))
+    // console.log(getBirdBonds(combinedMmcif))
+    console.log(getCcdBonds(combinedMmcif))
 }
 
 const DATA_DIR = './build/data'
 const CCD_PATH = `${DATA_DIR}/components.cif`
 const PVCD_PATH = `${DATA_DIR}/aa-variants-v1.cif`
+const BIRD_PATH = `${DATA_DIR}/prd-all.cif`
 const CCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif'
 const PVCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/aa-variants-v1.cif'
+const BIRD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/bird/prd/prd-all.cif.gz'
 
 const parser = new argparse.ArgumentParser({
   addHelp: true,