Browse Source

change SDF data header parsing
- do not trim <> around field
- store whole line starting with '> ' as data header (without the leading '> ')

dsehnal 3 years ago
parent
commit
06b9c5f2de
3 changed files with 15 additions and 10 deletions
  1. 1 0
      CHANGELOG.md
  2. 10 7
      src/mol-io/reader/_spec/sdf.spec.ts
  3. 4 3
      src/mol-io/reader/sdf/parser.ts

+ 1 - 0
CHANGELOG.md

@@ -7,6 +7,7 @@ Note that since we don't clearly distinguish between a public and private interf
 ## [Unreleased]
 
 - Add `tubularHelices` parameter to Cartoon representation
+- Add `SdfFormat` and update SDF parser to be able to parse data headers according to spec (hopefully :)) #230
 
 ## [v2.1.0] - 2021-07-05
 

+ 10 - 7
src/mol-io/reader/_spec/sdf.spec.ts

@@ -22,8 +22,8 @@ M  END
 > <DATABASE_NAME>
 drugbank
 
-> <SMILES>
-[O-]P([O-])([O-])=O
+> 5225 <TEST_FIELD>
+whatever
 
 > <INCHI_IDENTIFIER>
 InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3
@@ -362,22 +362,25 @@ describe('sdf reader', () => {
         expect(bonds.atomIdxB.value(3)).toBe(5);
         expect(bonds.order.value(3)).toBe(1);
 
-        expect(dataItems.dataHeader.value(0)).toBe('DATABASE_ID');
+        expect(dataItems.dataHeader.value(0)).toBe('<DATABASE_ID>');
         expect(dataItems.data.value(0)).toBe('0');
 
-        expect(dataItems.dataHeader.value(1)).toBe('DATABASE_NAME');
+        expect(dataItems.dataHeader.value(1)).toBe('<DATABASE_NAME>');
         expect(dataItems.data.value(1)).toBe('drugbank');
 
-        expect(dataItems.dataHeader.value(31)).toBe('SYNONYMS');
+        expect(dataItems.dataHeader.value(2)).toBe('5225 <TEST_FIELD>');
+        expect(dataItems.data.value(2)).toBe('whatever');
+
+        expect(dataItems.dataHeader.value(31)).toBe('<SYNONYMS>');
         expect(dataItems.data.value(31)).toBe('Orthophosphate; Phosphate');
 
         expect(compound1.dataItems.data.value(0)).toBe('0');
         expect(compound2.dataItems.data.value(0)).toBe('1');
 
-        expect(compound3.dataItems.dataHeader.value(2)).toBe('PUBCHEM_CONFORMER_DIVERSEORDER');
+        expect(compound3.dataItems.dataHeader.value(2)).toBe('<PUBCHEM_CONFORMER_DIVERSEORDER>');
         expect(compound3.dataItems.data.value(2)).toBe('1\n11\n10\n3\n15\n17\n13\n5\n16\n7\n14\n9\n8\n4\n18\n6\n12\n2');
 
-        expect(compound3.dataItems.dataHeader.value(21)).toBe('PUBCHEM_COORDINATE_TYPE');
+        expect(compound3.dataItems.dataHeader.value(21)).toBe('<PUBCHEM_COORDINATE_TYPE>');
         expect(compound3.dataItems.data.value(21)).toBe('2\n5\n10');
     });
 });

+ 4 - 3
src/mol-io/reader/sdf/parser.ts

@@ -26,6 +26,7 @@ export interface SdfFile {
     readonly compounds: SdfFileCompound[]
 }
 
+
 const delimiter = '$$$$';
 function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, data: Column<string> } {
     const dataHeader = TokenBuilder.create(tokenizer.data, 32);
@@ -36,8 +37,8 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
         if (line.startsWith(delimiter)) break;
         if (!line) continue;
 
-        if (line.startsWith('> <')) {
-            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 3, tokenizer.tokenEnd - 1);
+        if (line.startsWith('> ')) {
+            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 2, tokenizer.tokenEnd);
 
             Tokenizer.markLine(tokenizer);
             const start = tokenizer.tokenStart;
@@ -45,7 +46,7 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
             let added = false;
             while (tokenizer.position < tokenizer.length) {
                 const line2 = Tokenizer.readLine(tokenizer);
-                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> <')) {
+                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> ')) {
                     TokenBuilder.add(data, start, end);
                     added = true;
                     break;