Bläddra i källkod

improved schema generation from mmcif dic

Alexander Rose 7 år sedan
förälder
incheckning
e9b57bb89b

+ 88 - 0
data/bird-field-names.csv

@@ -0,0 +1,88 @@
+pdbx_reference_molecule.prd_id
+pdbx_reference_molecule.name
+pdbx_reference_molecule.represent_as
+pdbx_reference_molecule.type
+pdbx_reference_molecule.type_evidence_code
+pdbx_reference_molecule.class
+pdbx_reference_molecule.class_evidence_code
+pdbx_reference_molecule.formula
+pdbx_reference_molecule.chem_comp_id
+pdbx_reference_molecule.formula_weight
+pdbx_reference_molecule.release_status
+pdbx_reference_molecule.replaces
+pdbx_reference_molecule.replaced_by
+pdbx_reference_molecule.compound_detail
+pdbx_reference_molecule.description
+pdbx_reference_molecule.representative_PDB_id_code
+
+pdbx_reference_entity_list.prd_id
+pdbx_reference_entity_list.ref_entity_id
+pdbx_reference_entity_list.component_id
+pdbx_reference_entity_list.type
+pdbx_reference_entity_list.details
+
+pdbx_reference_entity_nonpoly.prd_id
+pdbx_reference_entity_nonpoly.ref_entity_id
+pdbx_reference_entity_nonpoly.name
+pdbx_reference_entity_nonpoly.chem_comp_id
+
+pdbx_reference_entity_link.prd_id
+pdbx_reference_entity_link.link_id
+pdbx_reference_entity_link.link_class
+pdbx_reference_entity_link.ref_entity_id_1
+pdbx_reference_entity_link.entity_seq_num_1
+pdbx_reference_entity_link.comp_id_1
+pdbx_reference_entity_link.atom_id_1
+pdbx_reference_entity_link.ref_entity_id_2
+pdbx_reference_entity_link.entity_seq_num_2
+pdbx_reference_entity_link.comp_id_2
+pdbx_reference_entity_link.atom_id_2
+pdbx_reference_entity_link.value_order
+pdbx_reference_entity_link.component_1
+pdbx_reference_entity_link.component_2
+pdbx_reference_entity_link.details
+
+pdbx_reference_entity_poly_link.prd_id
+pdbx_reference_entity_poly_link.ref_entity_id
+pdbx_reference_entity_poly_link.link_id
+pdbx_reference_entity_poly_link.atom_id_1
+pdbx_reference_entity_poly_link.comp_id_1
+pdbx_reference_entity_poly_link.entity_seq_num_1
+pdbx_reference_entity_poly_link.atom_id_2
+pdbx_reference_entity_poly_link.comp_id_2
+pdbx_reference_entity_poly_link.entity_seq_num_2
+pdbx_reference_entity_poly_link.value_order
+pdbx_reference_entity_poly_link.component_id
+
+pdbx_reference_entity_poly.prd_id
+pdbx_reference_entity_poly.ref_entity_id
+pdbx_reference_entity_poly.db_code
+pdbx_reference_entity_poly.db_name
+pdbx_reference_entity_poly.type
+
+pdbx_reference_entity_sequence.prd_id
+pdbx_reference_entity_sequence.ref_entity_id
+pdbx_reference_entity_sequence.type
+pdbx_reference_entity_sequence.NRP_flag
+pdbx_reference_entity_sequence.one_letter_codes
+
+pdbx_reference_entity_poly_seq.prd_id
+pdbx_reference_entity_poly_seq.ref_entity_id
+pdbx_reference_entity_poly_seq.num
+pdbx_reference_entity_poly_seq.mon_id
+pdbx_reference_entity_poly_seq.parent_mon_id
+pdbx_reference_entity_poly_seq.hetero
+pdbx_reference_entity_poly_seq.observed
+
+pdbx_reference_entity_src_nat.prd_id
+pdbx_reference_entity_src_nat.ref_entity_id
+pdbx_reference_entity_src_nat.ordinal
+pdbx_reference_entity_src_nat.taxid
+pdbx_reference_entity_src_nat.organism_scientific
+pdbx_reference_entity_src_nat.db_code
+pdbx_reference_entity_src_nat.db_name
+
+pdbx_prd_audit.prd_id
+pdbx_prd_audit.date
+pdbx_prd_audit.processing_site
+pdbx_prd_audit.action_type

+ 4 - 5
src/apps/schema-generator/schema-from-mmcif-dic.ts

@@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
     const parsed = await comp();
     if (parsed.isError) throw parsed
 
-    console.log(fieldNamesPath, minCount)
+    // console.log(fieldNamesPath, minCount)
 
     let filter: Filter | undefined
     if (minCount && fieldNamesPath) {
@@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
     } else if (minCount) {
         filter = await getUsageCountsFilter(minCount)
     } else if (fieldNamesPath) {
-        console.log('MOIN')
         filter = await getFieldNamesFilter(fieldNamesPath)
     }
 
@@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
     const filter: Filter = {}
     fieldNames.forEach((name, i) => {
         const [ category, field ] = name.split('.')
-        console.log(category, field)
+        // console.log(category, field)
         if (!filter[ category ]) filter[ category ] = {}
         filter[ category ][ field ] = true
     })
-    console.log(filter)
+    // console.log(filter)
     return filter
 }
 
@@ -93,7 +92,7 @@ async function ensureMmcifDicAvailable() {
     if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) {
         console.log('downloading mmcif dic...')
         const data = await fetch(MMCIF_DIC_URL)
-        if (!fs.existsSync(MMCIF_DIC_DIR)){
+        if (!fs.existsSync(MMCIF_DIC_DIR)) {
             fs.mkdirSync(MMCIF_DIC_DIR);
         }
         fs.writeFileSync(MMCIF_DIC_PATH, await data.text())

+ 72 - 32
src/apps/schema-generator/util/cif-dic.ts

@@ -4,22 +4,25 @@
  * @author Alexander Rose <alexander.rose@weirdbyte.de>
  */
 
-import { Database, Column } from './json-schema'
+import { Database, ValueColumn, ListColumn } from './json-schema'
 import * as Data from 'mol-io/reader/cif/data-model'
 
-export function getFieldType (type: string, values?: string[]): Column {
+export function getFieldType (type: string, values?: string[]): ValueColumn|ListColumn {
     switch (type) {
         case 'code':
         case 'ucode':
-            if (values && values.length) {
-                return { 'enum': values }
-            } else {
-                return 'str'
-            }
         case 'line':
         case 'uline':
         case 'text':
         case 'char':
+        case 'uchar3':
+        case 'uchar1':
+        case 'boolean':
+            if (values && values.length) {
+                return { enum: [ 'str', values ] }
+            } else {
+                return 'str'
+            }
         case 'aliasname':
         case 'name':
         case 'idname':
@@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column {
         case 'phone':
         case 'email':
         case 'code30':
-        case 'ec-type':
         case 'seq-one-letter-code':
         case 'author':
         case 'orcid_id':
@@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column {
         case 'float-range':
         case 'binary':
         case 'operation_expression':
-        case 'ucode-alphanum-csv':
         case 'point_symmetry':
-        case 'id_list':
         case '4x3_matrix':
         case '3x4_matrices':
         case 'point_group':
         case 'point_group_helical':
-        case 'boolean':
         case 'symmetry_operation':
         case 'date_dep':
-        case 'uchar3':
-        case 'uchar1':
         case 'url':
         case 'symop':
             return 'str'
         case 'int':
         case 'non_negative_int':
         case 'positive_int':
-            return 'int'
+            if (values && values.length) {
+                return { enum: [ 'int', values ] }
+            } else {
+                return 'int'
+            }
         case 'float':
             return 'float'
+        case 'ec-type':
+        case 'ucode-alphanum-csv':
+        case 'id_list':
+            return { list: [ 'str', ',' ] }
     }
     console.log(`unknown type '${type}'`)
     return 'str'
@@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa
     }
 }
 
-function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
+function getEnums (d: Data.Frame, ctx: FrameData) {
     const value = getField('item_enumeration', 'value', d, ctx)
+    const enums: string[] = []
     if (value) {
-        const enums: string[] = []
         for (let i = 0; i < value.rowCount; ++i) {
             enums.push(value.str(i))
             // console.log(value.str(i))
@@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
     }
 }
 
-function getCode (d: Data.Frame, ctx: FrameData): [string, string[]]|undefined {
+function getCode (d: Data.Frame, ctx: FrameData): [string, string[]|undefined]|undefined {
     const code = getField('item_type', 'code', d, ctx)
     if (code) {
-        let c = code.str(0)
-        let e = []
-        if (c === 'ucode') {
-            const enums = getEnums(d, ctx)
-            if (enums) e.push(...enums)
-        }
-        return [c, e]
+        return [ code.str(0), getEnums(d, ctx) ]
     } else {
         console.log(`item_type.code not found for '${d.header}'`)
     }
@@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined {
 }
 
 const FORCE_INT_FIELDS = [ // item headers, presumably forced to int by the schema generator (consumer not visible here, TODO confirm); this change only reorders the same nine entries
+    '_atom_site.id',
+    '_atom_site.auth_seq_id',
+    '_pdbx_struct_mod_residue.auth_seq_id',
     '_struct_conf.beg_auth_seq_id',
     '_struct_conf.end_auth_seq_id',
-    '_struct_sheet_range.beg_auth_seq_id',
-    '_struct_sheet_range.end_auth_seq_id',
     '_struct_conn.ptnr1_auth_seq_id',
     '_struct_conn.ptnr2_auth_seq_id',
-    '_pdbx_struct_mod_residue.auth_seq_id',
-    '_atom_site.id',
-    '_atom_site.auth_seq_id'
+    '_struct_sheet_range.beg_auth_seq_id',
+    '_struct_sheet_range.end_auth_seq_id',
+];
+
+const COMMA_SEPARATED_LIST_FIELDS = [ // item headers whose simple (string-named) field type generateSchema replaces with a comma-separated list of str
+    '_atom_site.pdbx_struct_group_id',
+    '_chem_comp.mon_nstd_parent_comp_id',
+    '_diffrn_radiation.pdbx_wavelength_list',
+    '_diffrn_source.pdbx_wavelength_list',
+    '_em_diffraction.tilt_angle_list', // 20,40,50,55
+    '_em_entity_assembly.entity_id_list',
+    '_entity.pdbx_ec',
+    '_pdbx_depui_entry_details.experimental_methods',
+    '_pdbx_depui_entry_details.requested_accession_types',
+    '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
+    '_pdbx_soln_scatter_model.software_author_list', // MSI
+    '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
+    '_pdbx_struct_assembly_gen.entity_inst_id',
+    '_pdbx_struct_assembly_gen.asym_id_list',
+    '_pdbx_struct_assembly_gen.auth_asym_id_list',
+    '_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
+    '_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
+    '_pdbx_struct_group_list.group_enumeration_type',
+    '_reflns.pdbx_diffrn_id',
+    '_refine.pdbx_diffrn_id',
+    '_reflns_shell.pdbx_diffrn_id',
+    '_struct_keywords.text',
+];
+
+const SPACE_SEPARATED_LIST_FIELDS = [ // item headers whose simple (string-named) field type generateSchema replaces with a space-separated list of str; checked only after the comma-separated list
+    '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
+    '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
+    '_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
 ];
 
 export function generateSchema (dic: Data.Block) {
@@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) {
         } else {
             if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) {
                 fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] }
-                // console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
+                console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
             } else if (itemName.match(/\[[1-3]\]/)) {
                 fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] }
-                // console.log(`${d.header} should have 'vector' _item_sub_category.id`)
+                console.log(`${d.header} should have 'vector' _item_sub_category.id`)
             } else {
                 const code = getCode(d, ctx)
                 if (code) {
-                    fields[itemName] = getFieldType(code[0], code[1])
+                    let fieldType = getFieldType(code[0], code[1]);
+                    if (typeof fieldType === 'string') {
+                        if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
+                            fieldType = { 'list': [ 'str', ',' ] };
+                            console.log(`comma separated: ${d.header}`)
+                        } else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
+                            fieldType = { 'list': [ 'str', ' ' ] };
+                            console.log(`space separated: ${d.header}`)
+                        }
+                    }
+                    fields[itemName] = fieldType
                 } else {
                     console.log(`could not determine code for '${d.header}'`)
                 }

+ 16 - 6
src/apps/schema-generator/util/generate.ts

@@ -27,7 +27,8 @@ const coord = Schema.coord;
 
 const Aliased = Schema.Aliased;
 const Matrix = Schema.Matrix;
-const Vector = Schema.Vector;`
+const Vector = Schema.Vector;
+const List = Schema.List;`
 }
 
 function footer (name: string) {
@@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }`
 }
 
 const value: { [k: string]: (...args: any[]) => string } = { // one renderer per complex column type; each returns the schema expression as source text
-    enum: function (...values: string[]) {
-        return `Aliased<'${values.join(`' | '`)}'>(str)`
+    enum: function (type: string, values: string[]) { // union of quoted literals aliased onto the given base type; NOTE(review): literals are quoted even when the base type is int, confirm intended
+        return `Aliased<'${values.join(`' | '`)}'>(${type})`
     },
     matrix: function (rows: number, cols: number) {
         return `Matrix(${rows}, ${cols})`
     },
     vector: function (dim: number) {
         return `Vector(${dim})`
+    },
+    list: function (type: 'str'|'int'|'float', separator: string) { // element parser is chosen from the element type; anything other than int or float is passed through unchanged
+        if (type === 'int') {
+            return `List('${separator}', x => parseInt(x, 10))`
+        } else if (type === 'float') {
+            return `List('${separator}', x => parseFloat(x))`
+        } else {
+            return `List('${separator}', x => x)`
+        }
     }
 }
 
@@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
     codeLines.push(`export const ${name}_Schema = {`)
     Object.keys(schema).forEach(table => {
         if (fields && !fields[ table ]) return
-        codeLines.push(`\t${safePropertyString(table)}: {`)
+        codeLines.push(`    ${safePropertyString(table)}: {`)
         const columns = schema[ table ]
         Object.keys(columns).forEach(columnName => {
             if (fields && !fields[ table ][ columnName ]) return
@@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
             } else {
                 typeDef = fieldType
             }
-            codeLines.push(`\t\t${safePropertyString(columnName)}: ${typeDef},`)
+            codeLines.push(`        ${safePropertyString(columnName)}: ${typeDef},`)
         })
-        codeLines.push('\t},')
+        codeLines.push('    },')
     })
     codeLines.push('}')
 

+ 7 - 2
src/apps/schema-generator/util/json-schema.ts

@@ -12,7 +12,8 @@ export interface Table {
     [ columnName: string ]: Column
 }
 
-export type Column = IntCol | StrCol | FloatCol | CoordCol | EnumCol | VectorCol | MatrixCol
+export type ValueColumn = IntCol | StrCol | FloatCol | CoordCol | EnumCol
+export type Column = ValueColumn | VectorCol | MatrixCol | ListColumn
 
 type IntCol = 'int'
 type StrCol = 'str'
@@ -24,7 +25,7 @@ interface ComplexColumn {
 }
 
 interface EnumCol extends ComplexColumn {
-    enum: string[]
+    enum: [ IntCol | StrCol, string[] ] // [ base column type, allowed values ]
 }
 
 interface VectorCol extends ComplexColumn {
@@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn {
     matrix: [ number, number ]
 }
 
+export interface ListColumn extends ComplexColumn { // column whose cell holds several values joined by a separator
+    list: [ ValueColumn, string ] // [ element type, separator ]
+}
+
 export function getTypeAndArgs (column: ComplexColumn) {
     const type = Object.keys(column)[0] as string
     const args = column[ type ]

+ 12 - 3
src/apps/schema-generator/util/validate.ts

@@ -7,12 +7,16 @@
 import { Database, Table, Column } from './json-schema'
 
 const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ]
-const ComplexColumnTypes = [ 'enum', 'vector', 'matrix' ]
+const ComplexColumnTypes = [ 'enum', 'vector', 'matrix', 'list' ]
 
 function allTrue<T> (list: T[], fn: (e: T) => boolean) { // does fn hold for every element? an empty list yields true
     return list.every(element => fn(element))
 }
 
+function allString (list: string[]) { // true when every element is a string at runtime; an empty list yields true
+    return list.reduce((a, v) => a && typeof v === 'string', true)
+}
+
 function validateColumn (column: Column): true|Error {
     if (typeof column === 'string') {
         if (!SimpleColumnTypes.includes(column)) {
@@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error {
         }
         switch (type) {
             case 'enum':
-                if (!args.reduce((a, v) => a && typeof v === 'string', true)) {
-                    return new Error(`enum column must have string args`)
+                if (args.length !== 2 && (!allString(args[1]) && !allTrue(args[1], Number.isInteger))) {
+                    return new Error(`enum column must have all string or all integer args ${args}`)
                 }
                 break;
             case 'vector':
@@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error {
                     return new Error(`matrix column must have two integer args`)
                 }
                 break;
+            case 'list':
+                if (args.length !== 2 || !allString(args)) {
+                    return new Error(`list column must have two string args`)
+                }
+                break;
             default:
                 return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`)
         }