cif-dic.ts 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. /**
  2. * Copyright (c) 2017-2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  5. */
  6. import { Database, Column, EnumCol, StrCol, IntCol, ListCol, FloatCol, CoordCol, MatrixCol, VectorCol } from './schema';
  7. import { parseImportGet } from './helper';
  8. import * as Data from '../../../mol-io/reader/cif/data-model';
  9. import { CifFrame } from '../../../mol-io/reader/cif/data-model';
  10. export function getFieldType(type: string, description: string, values?: string[], container?: string): Column {
  11. switch (type) {
  12. // mmCIF
  13. case 'code':
  14. case 'line':
  15. case 'text':
  16. case 'char':
  17. case 'boolean':
  18. return values && values.length ? EnumCol(values, 'str', description) : StrCol(description);
  19. case 'ucode':
  20. case 'uline':
  21. case 'uchar3':
  22. case 'uchar1':
  23. // only force lower-case for enums
  24. return values && values.length ? EnumCol(values.map(x => x.toLowerCase()), 'lstr', description) : StrCol(description);
  25. case 'aliasname':
  26. case 'name':
  27. case 'idname':
  28. case 'any':
  29. case 'atcode':
  30. case 'fax':
  31. case 'phone':
  32. case 'email':
  33. case 'code30':
  34. case 'seq-one-letter-code':
  35. case 'author':
  36. case 'orcid_id':
  37. case 'pdbx_PDB_obsoleted_db_id':
  38. case 'pdbx_related_db_id':
  39. case 'sequence_dep':
  40. case 'pdb_id':
  41. case 'emd_id':
  42. // todo, consider adding specialised fields
  43. case 'yyyy-mm-dd':
  44. case 'yyyy-mm-dd:hh:mm':
  45. case 'yyyy-mm-dd:hh:mm-flex':
  46. case 'int-range':
  47. case 'float-range':
  48. case 'binary':
  49. case 'operation_expression':
  50. case 'point_symmetry':
  51. case '4x3_matrix':
  52. case '3x4_matrices':
  53. case 'point_group':
  54. case 'point_group_helical':
  55. case 'symmetry_operation':
  56. case 'date_dep':
  57. case 'url':
  58. case 'symop':
  59. case 'exp_data_doi':
  60. case 'asym_id':
  61. return StrCol(description);
  62. case 'int':
  63. case 'non_negative_int':
  64. case 'positive_int':
  65. return values && values.length ? EnumCol(values, 'int', description) : IntCol(description);
  66. case 'float':
  67. return FloatCol(description);
  68. case 'ec-type':
  69. case 'ucode-alphanum-csv':
  70. case 'id_list':
  71. return ListCol('str', ',', description);
  72. case 'id_list_spc':
  73. return ListCol('str', ' ', description);
  74. // cif
  75. case 'Text':
  76. case 'Code':
  77. case 'Complex':
  78. case 'Symop':
  79. case 'List':
  80. case 'List(Real,Real)':
  81. case 'List(Real,Real,Real,Real)':
  82. case 'Date':
  83. case 'DateTime':
  84. case 'Tag':
  85. case 'Implied':
  86. case 'Word':
  87. return wrapContainer('str', ',', description, container);
  88. case 'Real':
  89. return wrapContainer('float', ',', description, container);
  90. case 'Integer':
  91. return wrapContainer('int', ',', description, container);
  92. }
  93. console.log(`unknown type '${type}'`);
  94. return StrCol(description);
  95. }
  96. function ColFromType(type: 'int' | 'str' | 'float' | 'coord', description: string): Column {
  97. switch (type) {
  98. case 'int': return IntCol(description);
  99. case 'str': return StrCol(description);
  100. case 'float': return FloatCol(description);
  101. case 'coord': return CoordCol(description);
  102. }
  103. }
  104. function wrapContainer(type: 'int' | 'str' | 'float' | 'coord', separator: string, description: string, container?: string) {
  105. return container && container === 'List' ? ListCol(type, separator, description) : ColFromType(type, description);
  106. }
  107. type FrameCategories = { [category: string]: Data.CifFrame }
  108. type FrameLinks = { [k: string]: string }
  109. interface FrameData {
  110. categories: FrameCategories
  111. links: FrameLinks
  112. }
  113. type Imports = Map<string, CifFrame[]>
  114. function getImportFrames(d: Data.CifFrame, imports: Imports) {
  115. const frames: Data.CifFrame[] = [];
  116. if (!('import' in d.categories)) return frames;
  117. const importGet = parseImportGet(d.categories['import'].getField('get')!.str(0));
  118. for (const g of importGet) {
  119. const { file, save } = g;
  120. if (!file || !save) {
  121. console.warn(`missing 'save' or 'file' for import in '${d.header}'`);
  122. continue;
  123. }
  124. const importFrames = imports.get(file);
  125. if (!importFrames) {
  126. console.warn(`missing '${file}' entry in imports`);
  127. continue;
  128. }
  129. const importSave = importFrames.find(id => id.header.toLowerCase() === save.toLowerCase());
  130. if (!importSave) {
  131. console.warn(`missing '${save}' save frame in '${file}'`);
  132. continue;
  133. }
  134. frames.push(importSave);
  135. }
  136. return frames;
  137. }
  138. /** get field from given or linked category */
  139. function getField(category: string, field: string, d: Data.CifFrame, imports: Imports, ctx: FrameData): Data.CifField|undefined {
  140. const { categories, links } = ctx;
  141. const cat = d.categories[category];
  142. if (cat) {
  143. return cat.getField(field);
  144. } else if (d.header in links) {
  145. const linkName = links[d.header];
  146. if (linkName in categories) {
  147. return getField(category, field, categories[linkName], imports, ctx);
  148. } else {
  149. // console.log(`link '${linkName}' not found`)
  150. }
  151. } else {
  152. const importFrames = getImportFrames(d, imports);
  153. for (const idf of importFrames) {
  154. return getField(category, field, idf, imports, ctx);
  155. }
  156. }
  157. }
  158. function getEnums(d: Data.CifFrame, imports: Imports, ctx: FrameData) {
  159. const value = getField('item_enumeration', 'value', d, imports, ctx);
  160. const enums: string[] = [];
  161. if (value) {
  162. for (let i = 0; i < value.rowCount; ++i) {
  163. enums.push(value.str(i));
  164. // console.log(value.str(i))
  165. }
  166. return enums;
  167. } else {
  168. // console.log(`item_enumeration.value not found for '${d.header}'`)
  169. }
  170. }
  171. function getContainer(d: Data.CifFrame, imports: Imports, ctx: FrameData) {
  172. const value = getField('type', 'container', d, imports, ctx);
  173. return value ? value.str(0) : undefined;
  174. }
  175. function getCode(d: Data.CifFrame, imports: Imports, ctx: FrameData): [string, string[] | undefined, string | undefined ] | undefined {
  176. const code = getField('item_type', 'code', d, imports, ctx) || getField('type', 'contents', d, imports, ctx);
  177. if (code) {
  178. return [code.str(0), getEnums(d, imports, ctx), getContainer(d, imports, ctx)];
  179. } else {
  180. console.log(`item_type.code or type.contents not found for '${d.header}'`);
  181. }
  182. }
  183. function getSubCategory(d: Data.CifFrame, imports: Imports, ctx: FrameData): string | undefined {
  184. const value = getField('item_sub_category', 'id', d, imports, ctx);
  185. if (value) {
  186. return value.str(0);
  187. }
  188. }
  189. function getDescription(d: Data.CifFrame, imports: Imports, ctx: FrameData): string | undefined {
  190. const value = getField('item_description', 'description', d, imports, ctx) || getField('description', 'text', d, imports, ctx);
  191. if (value) {
  192. // trim (after newlines) and remove references to square brackets
  193. return value.str(0).trim()
  194. .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n')
  195. .replace(/(\[[1-3]\])+ element/, 'elements')
  196. .replace(/(\[[1-3]\])+/, '');
  197. }
  198. }
  199. function getAliases(d: Data.CifFrame, imports: Imports, ctx: FrameData): string[] | undefined {
  200. const value = getField('item_aliases', 'alias_name', d, imports, ctx) || getField('alias', 'definition_id', d, imports, ctx);
  201. return value ? value.toStringArray().map(v => v.substr(1)) : undefined;
  202. }
/** Matches a pair of square-bracket indices, each in the range 1-3, e.g. `[1][2]` */
const reMatrixField = /\[[1-3]\]\[[1-3]\]/;
/** Matches a single square-bracket index in the range 1-3, e.g. `[1]` */
const reVectorField = /\[[1-3]\]/;

/** Headers of item definitions whose column is always created as an int column */
const FORCE_INT_FIELDS = [
    '_atom_site.id',
    '_atom_site.auth_seq_id',
    '_atom_site_anisotrop.id',
    '_pdbx_struct_mod_residue.auth_seq_id',
    '_struct_conf.beg_auth_seq_id',
    '_struct_conf.end_auth_seq_id',
    '_struct_conn.ptnr1_auth_seq_id',
    '_struct_conn.ptnr2_auth_seq_id',
    '_struct_sheet_range.beg_auth_seq_id',
    '_struct_sheet_range.end_auth_seq_id',
];
/**
 * Maps the header of a matrix member item to the name of the matrix column
 * that is added next to it.
 *
 * Note that name and mapped name must share a prefix. This is not always the case in
 * the cifCore dictionary, but for downstream code to work a container field with the
 * same prefix as the member fields must be given here and in the field names filter
 * list.
 */
const FORCE_MATRIX_FIELDS_MAP: { [k: string]: string } = {
    'atom_site_aniso.u_11': 'u', // is matrix_u in the dic
    'atom_site_aniso.u_22': 'u',
    'atom_site_aniso.u_33': 'u',
    'atom_site_aniso.u_23': 'u',
    'atom_site_aniso.u_13': 'u',
    'atom_site_aniso.u_12': 'u',
};
/** Headers of all items listed in `FORCE_MATRIX_FIELDS_MAP` */
const FORCE_MATRIX_FIELDS = Object.keys(FORCE_MATRIX_FIELDS_MAP);

/** Aliases the generated schema starts out with, before aliases from the dictionary are added */
const EXTRA_ALIASES: Database['aliases'] = {
    'atom_site_aniso.matrix_u': [
        'atom_site_anisotrop_U',
        'atom_site_aniso.U'
    ],
};
/**
 * Headers of items whose string column is replaced by a comma separated list
 * column. Only consulted for items that would otherwise get a plain 'str' column.
 */
const COMMA_SEPARATED_LIST_FIELDS = [
    '_atom_site.pdbx_struct_group_id',
    '_chem_comp.mon_nstd_parent_comp_id',
    '_diffrn_radiation.pdbx_wavelength_list',
    '_diffrn_source.pdbx_wavelength_list',
    '_em_diffraction.tilt_angle_list', // 20,40,50,55
    '_em_entity_assembly.entity_id_list',
    '_entity.pdbx_description', // Endolysin,Beta-2 adrenergic receptor
    '_entity.pdbx_ec',
    '_entity_poly.pdbx_strand_id', // A,B
    '_entity_src_gen.pdbx_gene_src_gene', // ADRB2, ADRB2R, B2AR
    '_pdbx_depui_entry_details.experimental_methods',
    '_pdbx_depui_entry_details.requested_accession_types',
    '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
    '_pdbx_soln_scatter_model.software_author_list', // MSI
    '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
    '_pdbx_struct_assembly_gen.entity_inst_id',
    '_pdbx_struct_assembly_gen.asym_id_list',
    '_pdbx_struct_assembly_gen.auth_asym_id_list',
    '_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
    '_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
    '_pdbx_struct_group_list.group_enumeration_type',
    '_reflns.pdbx_diffrn_id',
    '_refine.pdbx_diffrn_id',
    '_reflns_shell.pdbx_diffrn_id',
    '_struct_keywords.text',
];

/** Same as `COMMA_SEPARATED_LIST_FIELDS`, but for lists separated by a space */
const SPACE_SEPARATED_LIST_FIELDS = [
    '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
    '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
    '_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
];

/** Same as `COMMA_SEPARATED_LIST_FIELDS`, but for lists separated by a semicolon */
const SEMICOLON_SEPARATED_LIST_FIELDS = [
    '_chem_comp.pdbx_synonyms' // GLYCERIN; PROPANE-1,2,3-TRIOL
];
/**
 * Additional enum values, keyed by the header of the item definition; they are
 * appended to the values of the enum column generated for that item.
 *
 * Useful when a dictionary extension will add enum values to an existing dictionary.
 * By adding them here, the dictionary extension can be tested before the added enum
 * values are available in the existing dictionary.
 */
const EXTRA_ENUM_VALUES: { [k: string]: string[] } = {
};
  280. export function generateSchema(frames: CifFrame[], imports: Imports = new Map()): Database {
  281. const tables: Database['tables'] = {};
  282. const aliases: Database['aliases'] = { ...EXTRA_ALIASES };
  283. const categories: FrameCategories = {};
  284. const links: FrameLinks = {};
  285. const ctx = { categories, links };
  286. // get category metadata
  287. frames.forEach(d => {
  288. // category definitions in mmCIF start with '_' and don't include a '.'
  289. // category definitions in cifCore don't include a '.'
  290. if (d.header[0] === '_' || d.header.includes('.')) return;
  291. const categoryName = d.header.toLowerCase();
  292. // console.log(d.header, d.categoryNames, d.categories)
  293. let descriptionField: Data.CifField | undefined;
  294. const categoryKeyNames = new Set<string>();
  295. if ('category' in d.categories && 'category_key' in d.categories) {
  296. const category = d.categories['category'];
  297. const categoryKey = d.categories['category_key'];
  298. if (categoryKey) {
  299. const categoryKey_names = categoryKey.getField('name');
  300. if (categoryKey_names) {
  301. for (let i = 0, il = categoryKey_names.rowCount; i < il; ++i) {
  302. categoryKeyNames.add(categoryKey_names.str(i));
  303. }
  304. }
  305. }
  306. descriptionField = category.getField('description');
  307. if (categoryKeyNames.size === 0) {
  308. console.log(`no key given for category '${categoryName}'`);
  309. }
  310. }
  311. if ('description' in d.categories) {
  312. descriptionField = d.categories['description'].getField('text');
  313. }
  314. let description = '';
  315. if (descriptionField) {
  316. description = descriptionField.str(0).trim()
  317. .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n'); // remove padding after newlines
  318. } else {
  319. console.log(`no description given for category '${categoryName}'`);
  320. }
  321. tables[categoryName] = { description, key: categoryKeyNames, columns: {} };
  322. // console.log('++++++++++++++++++++++++++++++++++++++++++')
  323. // console.log('name', categoryName)
  324. // console.log('desc', description)
  325. // console.log('key', categoryKeyNames)
  326. });
  327. // build list of links between categories
  328. frames.forEach(d => {
  329. if (d.header[0] !== '_' && !d.header.includes('.')) return;
  330. categories[d.header] = d;
  331. const item_linked = d.categories['item_linked'];
  332. if (item_linked) {
  333. const child_name = item_linked.getField('child_name');
  334. const parent_name = item_linked.getField('parent_name');
  335. if (child_name && parent_name) {
  336. for (let i = 0; i < item_linked.rowCount; ++i) {
  337. const childName: string = child_name.str(i);
  338. const parentName = parent_name.str(i);
  339. if (childName in links && links[childName] !== parentName) {
  340. console.log(`${childName} linked to ${links[childName]}, ignoring link to ${parentName}`);
  341. }
  342. links[childName] = parentName;
  343. }
  344. }
  345. }
  346. });
  347. // get field data
  348. Object.keys(categories).forEach(fullName => {
  349. const d = categories[fullName];
  350. if (!d) {
  351. console.log(`'${fullName}' not found, moving on`);
  352. return;
  353. }
  354. const categoryName = d.header.substring(d.header[0] === '_' ? 1 : 0, d.header.indexOf('.'));
  355. const itemName = d.header.substring(d.header.indexOf('.') + 1);
  356. let fields: { [k: string]: Column };
  357. if (categoryName in tables) {
  358. fields = tables[categoryName].columns;
  359. tables[categoryName].key.add(itemName);
  360. } else if (categoryName.toLowerCase() in tables) {
  361. // take case from category name in 'field' data as it is better if data is from cif dictionaries
  362. tables[categoryName] = tables[categoryName.toLowerCase()];
  363. fields = tables[categoryName].columns;
  364. } else {
  365. console.log(`category '${categoryName}' has no metadata`);
  366. fields = {};
  367. tables[categoryName] = {
  368. description: '',
  369. key: new Set(),
  370. columns: fields
  371. };
  372. }
  373. const itemAliases = getAliases(d, imports, ctx);
  374. if (itemAliases) aliases[`${categoryName}.${itemName}`] = itemAliases;
  375. const description = getDescription(d, imports, ctx) || '';
  376. // need to use regex to check for matrix or vector items
  377. // as sub_category assignment is missing for some entries
  378. const subCategory = getSubCategory(d, imports, ctx);
  379. if (subCategory === 'cartesian_coordinate' || subCategory === 'fractional_coordinate') {
  380. fields[itemName] = CoordCol(description);
  381. } else if (FORCE_INT_FIELDS.includes(d.header)) {
  382. fields[itemName] = IntCol(description);
  383. console.log(`forcing int: ${d.header}`);
  384. } else if (FORCE_MATRIX_FIELDS.includes(d.header)) {
  385. fields[itemName] = FloatCol(description);
  386. fields[FORCE_MATRIX_FIELDS_MAP[d.header]] = MatrixCol(3, 3, description);
  387. console.log(`forcing matrix: ${d.header}`);
  388. } else if (subCategory === 'matrix') {
  389. fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description);
  390. } else if (subCategory === 'vector') {
  391. fields[itemName.replace(reVectorField, '')] = VectorCol(3, description);
  392. } else {
  393. if (itemName.match(reMatrixField)) {
  394. fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description);
  395. console.log(`${d.header} should have 'matrix' _item_sub_category.id`);
  396. } else if (itemName.match(reVectorField)) {
  397. fields[itemName.replace(reVectorField, '')] = VectorCol(3, description);
  398. console.log(`${d.header} should have 'vector' _item_sub_category.id`);
  399. } else {
  400. const code = getCode(d, imports, ctx);
  401. if (code) {
  402. let fieldType = getFieldType(code[0], description, code[1], code[2]);
  403. if (fieldType.type === 'str') {
  404. if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
  405. fieldType = ListCol('str', ',', description);
  406. console.log(`forcing comma separated: ${d.header}`);
  407. } else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
  408. fieldType = ListCol('str', ' ', description);
  409. console.log(`forcing space separated: ${d.header}`);
  410. } else if (SEMICOLON_SEPARATED_LIST_FIELDS.includes(d.header)) {
  411. fieldType = ListCol('str', ';', description);
  412. console.log(`forcing space separated: ${d.header}`);
  413. }
  414. }
  415. if (d.header in EXTRA_ENUM_VALUES) {
  416. if (fieldType.type === 'enum') {
  417. fieldType.values.push(...EXTRA_ENUM_VALUES[d.header]);
  418. } else {
  419. console.warn(`expected enum: ${d.header}`);
  420. }
  421. }
  422. fields[itemName] = fieldType;
  423. } else {
  424. fields[itemName] = StrCol(description);
  425. // console.log(`could not determine code for '${d.header}'`)
  426. }
  427. }
  428. }
  429. });
  430. return { tables, aliases };
  431. }