cif-dic.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. /**
  2. * Copyright (c) 2017-2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  5. */
  6. import { Database, Column, EnumCol, StrCol, IntCol, ListCol, FloatCol, CoordCol, MatrixCol, VectorCol } from './schema'
  7. import * as Data from '../../../mol-io/reader/cif/data-model'
  8. import { CifFrame } from '../../../mol-io/reader/cif/data-model';
  9. export function getFieldType (type: string, description: string, values?: string[]): Column {
  10. switch (type) {
  11. case 'code':
  12. case 'ucode':
  13. case 'line':
  14. case 'uline':
  15. case 'text':
  16. case 'char':
  17. case 'uchar3':
  18. case 'uchar1':
  19. case 'boolean':
  20. return values && values.length ? EnumCol(values, 'str', description) : StrCol(description)
  21. case 'aliasname':
  22. case 'name':
  23. case 'idname':
  24. case 'any':
  25. case 'atcode':
  26. case 'fax':
  27. case 'phone':
  28. case 'email':
  29. case 'code30':
  30. case 'seq-one-letter-code':
  31. case 'author':
  32. case 'orcid_id':
  33. case 'sequence_dep':
  34. case 'pdb_id':
  35. case 'emd_id':
  36. // todo, consider adding specialised fields
  37. case 'yyyy-mm-dd':
  38. case 'yyyy-mm-dd:hh:mm':
  39. case 'yyyy-mm-dd:hh:mm-flex':
  40. case 'int-range':
  41. case 'float-range':
  42. case 'binary':
  43. case 'operation_expression':
  44. case 'point_symmetry':
  45. case '4x3_matrix':
  46. case '3x4_matrices':
  47. case 'point_group':
  48. case 'point_group_helical':
  49. case 'symmetry_operation':
  50. case 'date_dep':
  51. case 'url':
  52. case 'symop':
  53. case 'exp_data_doi':
  54. case 'asym_id':
  55. return StrCol(description)
  56. case 'int':
  57. case 'non_negative_int':
  58. case 'positive_int':
  59. return values && values.length ? EnumCol(values, 'int', description) : IntCol(description)
  60. case 'float':
  61. return FloatCol(description)
  62. case 'ec-type':
  63. case 'ucode-alphanum-csv':
  64. case 'id_list':
  65. return ListCol('str', ',', description)
  66. case 'id_list_spc':
  67. return ListCol('str', ' ', description)
  68. }
  69. console.log(`unknown type '${type}'`)
  70. return StrCol(description)
  71. }
  72. type FrameCategories = { [category: string]: Data.CifFrame }
  73. type FrameLinks = { [k: string]: string }
  74. interface FrameData {
  75. categories: FrameCategories
  76. links: FrameLinks
  77. }
  78. // get field from given or linked category
  79. function getField (category: string, field: string, d: Data.CifFrame, ctx: FrameData): Data.CifField|undefined {
  80. const { categories, links } = ctx
  81. const cat = d.categories[category]
  82. if (cat) {
  83. return cat.getField(field)
  84. } else {
  85. if (d.header in links) {
  86. const linkName = links[d.header]
  87. if (linkName in categories) {
  88. return getField(category, field, categories[linkName], ctx)
  89. } else {
  90. console.log(`link '${linkName}' not found`)
  91. }
  92. } else {
  93. // console.log(`no links found for '${d.header}'`)
  94. }
  95. }
  96. }
  97. function getEnums (d: Data.CifFrame, ctx: FrameData) {
  98. const value = getField('item_enumeration', 'value', d, ctx)
  99. const enums: string[] = []
  100. if (value) {
  101. for (let i = 0; i < value.rowCount; ++i) {
  102. enums.push(value.str(i))
  103. // console.log(value.str(i))
  104. }
  105. return enums
  106. } else {
  107. // console.log(`item_enumeration.value not found for '${d.header}'`)
  108. }
  109. }
  110. function getCode (d: Data.CifFrame, ctx: FrameData): [string, string[]|undefined]|undefined {
  111. const code = getField('item_type', 'code', d, ctx)
  112. if (code) {
  113. return [ code.str(0), getEnums(d, ctx) ]
  114. } else {
  115. console.log(`item_type.code not found for '${d.header}'`)
  116. }
  117. }
  118. function getSubCategory (d: Data.CifFrame, ctx: FrameData): string|undefined {
  119. const value = getField('item_sub_category', 'id', d, ctx)
  120. if (value) {
  121. return value.str(0)
  122. }
  123. }
  124. function getDescription (d: Data.CifFrame, ctx: FrameData): string|undefined {
  125. const value = getField('item_description', 'description', d, ctx)
  126. if (value) {
  127. // trim (after newlines) and remove references to square brackets
  128. return value.str(0).trim()
  129. .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n')
  130. .replace(/(\[[1-3]\])+ element/, 'elements')
  131. .replace(/(\[[1-3]\])+/, '')
  132. }
  133. }
  134. const reMatrixField = /\[[1-3]\]\[[1-3]\]/
  135. const reVectorField = /\[[1-3]\]/
  136. const FORCE_INT_FIELDS = [
  137. '_atom_site.id',
  138. '_atom_site.auth_seq_id',
  139. '_pdbx_struct_mod_residue.auth_seq_id',
  140. '_struct_conf.beg_auth_seq_id',
  141. '_struct_conf.end_auth_seq_id',
  142. '_struct_conn.ptnr1_auth_seq_id',
  143. '_struct_conn.ptnr2_auth_seq_id',
  144. '_struct_sheet_range.beg_auth_seq_id',
  145. '_struct_sheet_range.end_auth_seq_id',
  146. ];
  147. const COMMA_SEPARATED_LIST_FIELDS = [
  148. '_atom_site.pdbx_struct_group_id',
  149. '_chem_comp.mon_nstd_parent_comp_id',
  150. '_diffrn_radiation.pdbx_wavelength_list',
  151. '_diffrn_source.pdbx_wavelength_list',
  152. '_em_diffraction.tilt_angle_list', // 20,40,50,55
  153. '_em_entity_assembly.entity_id_list',
  154. '_entity.pdbx_description', // Endolysin,Beta-2 adrenergic receptor
  155. '_entity.pdbx_ec',
  156. '_entity_poly.pdbx_strand_id', // A,B
  157. '_entity_src_gen.pdbx_gene_src_gene', // ADRB2, ADRB2R, B2AR
  158. '_pdbx_depui_entry_details.experimental_methods',
  159. '_pdbx_depui_entry_details.requested_accession_types',
  160. '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
  161. '_pdbx_soln_scatter_model.software_author_list', // MSI
  162. '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
  163. '_pdbx_struct_assembly_gen.entity_inst_id',
  164. '_pdbx_struct_assembly_gen.asym_id_list',
  165. '_pdbx_struct_assembly_gen.auth_asym_id_list',
  166. '_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
  167. '_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
  168. '_pdbx_struct_group_list.group_enumeration_type',
  169. '_reflns.pdbx_diffrn_id',
  170. '_refine.pdbx_diffrn_id',
  171. '_reflns_shell.pdbx_diffrn_id',
  172. '_struct_keywords.text',
  173. ];
  174. const SPACE_SEPARATED_LIST_FIELDS = [
  175. '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
  176. '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
  177. '_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
  178. ];
  179. const SEMICOLON_SEPARATED_LIST_FIELDS = [
  180. '_chem_comp.pdbx_synonyms' // GLYCERIN; PROPANE-1,2,3-TRIOL
  181. ]
  182. /**
  183. * Useful when a dictionary extension will add enum values to an existing dictionary.
  184. * By adding them here, the dictionary extension can be tested before the added enum
  185. * values are available in the existing dictionary.
  186. */
  187. const EXTRA_ENUM_VALUES: { [k: string]: string[] } = {
  188. }
  189. export function generateSchema (frames: CifFrame[]) {
  190. const schema: Database = {}
  191. const categories: FrameCategories = {}
  192. const links: FrameLinks = {}
  193. const ctx = { categories, links }
  194. // get category metadata
  195. frames.forEach(d => {
  196. if (d.header[0] === '_') return
  197. const categoryKeyNames = new Set<string>()
  198. const categoryKey = d.categories['category_key']
  199. if (categoryKey) {
  200. const categoryKey_names = categoryKey.getField('name')
  201. if (categoryKey_names) {
  202. for (let i = 0, il = categoryKey_names.rowCount; i < il; ++i) {
  203. categoryKeyNames.add(categoryKey_names.str(i))
  204. }
  205. }
  206. }
  207. let description = ''
  208. const category = d.categories['category']
  209. if (category) {
  210. const category_description = category.getField('description')
  211. if (category_description) {
  212. description = category_description.str(0).trim()
  213. .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n') // remove padding after newlines
  214. } else {
  215. console.log(`no description given for category '${category}'`)
  216. }
  217. }
  218. if (categoryKeyNames.size === 0) {
  219. console.log(`no key given for category '${category}'`)
  220. }
  221. schema[d.header] = { description, key: categoryKeyNames, columns: {} }
  222. // console.log('++++++++++++++++++++++++++++++++++++++++++')
  223. // console.log('name', d.header)
  224. // console.log('desc', description)
  225. // console.log('key', categoryKeyNames)
  226. })
  227. // build list of links between categories
  228. frames.forEach(d => {
  229. if (d.header[0] !== '_') return
  230. categories[d.header] = d
  231. const item_linked = d.categories['item_linked']
  232. if (item_linked) {
  233. const child_name = item_linked.getField('child_name')
  234. const parent_name = item_linked.getField('parent_name')
  235. if (child_name && parent_name) {
  236. for (let i = 0; i < item_linked.rowCount; ++i) {
  237. const childName = child_name.str(i)
  238. const parentName = parent_name.str(i)
  239. if (childName in links && links[childName] !== parentName) {
  240. console.log(`${childName} linked to ${links[childName]}, ignoring link to ${parentName}`)
  241. }
  242. links[childName] = parentName
  243. }
  244. }
  245. }
  246. })
  247. // get field data
  248. Object.keys(categories).forEach(fullName => {
  249. const d = categories[fullName]
  250. if (!d) {
  251. console.log(`${fullName} not found, moving on`)
  252. return
  253. }
  254. const categoryName = d.header.substring(1, d.header.indexOf('.'))
  255. const itemName = d.header.substring(d.header.indexOf('.') + 1)
  256. let fields: { [k: string]: Column }
  257. if (categoryName in schema) {
  258. fields = schema[categoryName].columns
  259. } else {
  260. console.log(`category '${categoryName}' has no metadata`)
  261. fields = {}
  262. schema[categoryName] = {
  263. description: '',
  264. key: new Set(),
  265. columns: fields
  266. }
  267. }
  268. const description = getDescription(d, ctx) || ''
  269. // need to use regex to check for matrix or vector items
  270. // as sub_category assignment is missing for some entries
  271. const subCategory = getSubCategory(d, ctx)
  272. if (subCategory === 'cartesian_coordinate' || subCategory === 'fractional_coordinate') {
  273. fields[itemName] = CoordCol(description)
  274. } else if (FORCE_INT_FIELDS.includes(d.header)) {
  275. fields[itemName] = IntCol(description)
  276. console.log(`forcing int: ${d.header}`)
  277. } else if (subCategory === 'matrix') {
  278. fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description)
  279. } else if (subCategory === 'vector') {
  280. fields[itemName.replace(reVectorField, '')] = VectorCol(3, description)
  281. } else {
  282. if (itemName.match(reMatrixField)) {
  283. fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description)
  284. console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
  285. } else if (itemName.match(reVectorField)) {
  286. fields[itemName.replace(reVectorField, '')] = VectorCol(3, description)
  287. console.log(`${d.header} should have 'vector' _item_sub_category.id`)
  288. } else {
  289. const code = getCode(d, ctx)
  290. if (code) {
  291. let fieldType = getFieldType(code[0], description, code[1]);
  292. if (fieldType.type === 'str') {
  293. if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
  294. fieldType = ListCol('str', ',', description)
  295. console.log(`forcing comma separated: ${d.header}`)
  296. } else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
  297. fieldType = ListCol('str', ' ', description)
  298. console.log(`forcing space separated: ${d.header}`)
  299. } else if (SEMICOLON_SEPARATED_LIST_FIELDS.includes(d.header)) {
  300. fieldType = ListCol('str', ';', description)
  301. console.log(`forcing space separated: ${d.header}`)
  302. }
  303. }
  304. if (d.header in EXTRA_ENUM_VALUES) {
  305. if (fieldType.type === 'enum') {
  306. fieldType.values.push(...EXTRA_ENUM_VALUES[d.header])
  307. } else {
  308. console.warn(`expected enum: ${d.header}`)
  309. }
  310. }
  311. fields[itemName] = fieldType
  312. } else {
  313. console.log(`could not determine code for '${d.header}'`)
  314. }
  315. }
  316. }
  317. })
  318. return schema
  319. }