schema-from-mmcif-dic.ts 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. /**
  2. * Copyright (c) 2017-2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  5. */
  6. import * as argparse from 'argparse'
  7. // import * as util from 'util'
  8. import * as fs from 'fs'
  9. import fetch from 'node-fetch'
  10. import Csv from 'mol-io/reader/csv/parser'
  11. import CIF from 'mol-io/reader/cif'
  12. import { generateSchema } from './util/cif-dic'
  13. import { generate } from './util/generate'
  14. import { Filter, mergeFilters } from './util/json-schema'
  15. import { Run } from 'mol-task';
  16. async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount = 0, typescript = false, out?: string) {
  17. await ensureMmcifDicAvailable()
  18. const comp = CIF.parseText(fs.readFileSync(MMCIF_DIC_PATH, 'utf8'))
  19. const parsed = await Run(comp);
  20. if (parsed.isError) throw parsed
  21. // console.log(fieldNamesPath, minCount)
  22. let filter: Filter | undefined
  23. if (minCount && fieldNamesPath) {
  24. filter = mergeFilters(
  25. await getUsageCountsFilter(minCount),
  26. await getFieldNamesFilter(fieldNamesPath)
  27. )
  28. } else if (minCount) {
  29. filter = await getUsageCountsFilter(minCount)
  30. } else if (fieldNamesPath) {
  31. filter = await getFieldNamesFilter(fieldNamesPath)
  32. }
  33. const schema = generateSchema(parsed.result.blocks[0])
  34. const output = typescript ? generate(name, schema, filter) : JSON.stringify(schema, undefined, 4)
  35. if (out) {
  36. fs.writeFileSync(out, output)
  37. } else {
  38. console.log(output)
  39. }
  40. }
  41. async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
  42. const fieldNamesStr = fs.readFileSync(fieldNamesPath, 'utf8')
  43. const parsed = await Run(Csv(fieldNamesStr, { noColumnNames: true }));
  44. if (parsed.isError) throw parser.error
  45. const csvFile = parsed.result;
  46. const fieldNamesCol = csvFile.table.getColumn('0')
  47. if (!fieldNamesCol) throw 'error getting fields columns'
  48. const fieldNames = fieldNamesCol.toStringArray()
  49. const filter: Filter = {}
  50. fieldNames.forEach((name, i) => {
  51. const [ category, field ] = name.split('.')
  52. // console.log(category, field)
  53. if (!filter[ category ]) filter[ category ] = {}
  54. filter[ category ][ field ] = true
  55. })
  56. // console.log(filter)
  57. return filter
  58. }
  59. async function getUsageCountsFilter(minCount: number): Promise<Filter> {
  60. const usageCountsStr = fs.readFileSync(MMCIF_USAGE_COUNTS_PATH, 'utf8')
  61. const parsed = await Run(Csv(usageCountsStr, { delimiter: ' ' }));
  62. if (parsed.isError) throw parser.error
  63. const csvFile = parsed.result;
  64. const fieldNamesCol = csvFile.table.getColumn('field_name')
  65. const usageCountsCol = csvFile.table.getColumn('usage_count')
  66. if (!fieldNamesCol || !usageCountsCol) throw 'error getting usage columns'
  67. const fieldNames = fieldNamesCol.toStringArray()
  68. const usageCounts = usageCountsCol.toIntArray()
  69. const filter: Filter = {}
  70. fieldNames.forEach((name, i) => {
  71. if (usageCounts[i] < minCount) return
  72. const [ category, field ] = name.substr(1).split('.')
  73. if (!filter[ category ]) filter[ category ] = {}
  74. filter[ category ][ field ] = true
  75. })
  76. return filter
  77. }
  78. async function ensureMmcifDicAvailable() {
  79. if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) {
  80. console.log('downloading mmcif dic...')
  81. const data = await fetch(MMCIF_DIC_URL)
  82. if (!fs.existsSync(MMCIF_DIC_DIR)) {
  83. fs.mkdirSync(MMCIF_DIC_DIR);
  84. }
  85. fs.writeFileSync(MMCIF_DIC_PATH, await data.text())
  86. console.log('done downloading mmcif dic')
  87. }
  88. }
  89. const MMCIF_USAGE_COUNTS_PATH = './data/mmcif-usage-counts.txt'
  90. const MMCIF_DIC_DIR = './build/dics'
  91. const MMCIF_DIC_PATH = `${MMCIF_DIC_DIR}/mmcif_pdbx_v50.dic`
  92. const MMCIF_DIC_URL = 'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic'
  93. const parser = new argparse.ArgumentParser({
  94. addHelp: true,
  95. description: 'Create schema from mmcif dictionary (v50, downloaded from wwPDB)'
  96. });
  97. parser.addArgument([ '--name', '-n' ], {
  98. defaultValue: 'mmCIF',
  99. help: 'Schema name'
  100. });
  101. parser.addArgument([ '--out', '-o' ], {
  102. help: 'Generated schema output path, if not given printed to stdout'
  103. });
  104. parser.addArgument([ '--typescript', '-ts' ], {
  105. action: 'storeTrue',
  106. help: 'Output schema as TypeScript instead of as JSON'
  107. });
  108. parser.addArgument([ '--minFieldUsageCount', '-mc' ], {
  109. defaultValue: 0,
  110. type: parseInt,
  111. help: 'Minimum mmcif field usage counts'
  112. });
  113. parser.addArgument([ '--fieldNamesPath', '-fn' ], {
  114. defaultValue: '',
  115. help: 'Field names to include'
  116. });
  117. parser.addArgument([ '--forceMmcifDicDownload', '-f' ], {
  118. action: 'storeTrue',
  119. help: 'Force download of mmcif dictionary'
  120. });
  121. interface Args {
  122. name: string
  123. forceMmcifDicDownload: boolean
  124. fieldNamesPath: string
  125. minFieldUsageCount: number
  126. typescript: boolean
  127. out: string
  128. }
  129. const args: Args = parser.parseArgs();
  130. const FORCE_MMCIF_DOWNLOAD = args.forceMmcifDicDownload
  131. if (args.name) {
  132. runGenerateSchema(args.name, args.fieldNamesPath, args.minFieldUsageCount, args.typescript, args.out)
  133. }