📄 parsedict.py
字号:
#! /usr/bin/env python
"""
Let's write our own python parser to clean up the pdf (after
pdftotext of course).
Instructions: run pdftotext like this:
$ pdftotext -f 9 -l 81 -raw -nopgbrk 04_06PU.PDF 04_06PU.txt
then run the python parser on the resulting text file like this:
$ python ParseDict.py 04_06PU.txt dicomV3.dic
"""
import re,os
"""
PdfTextParser takes as input a text file (produced by pdftotext)
and creates as output a clean file (ready to be processed by
DicomV3Expander).
Warning: PdfTextParser does not expand:
- (xxxx,xxxx to xxxx) xxxxxxxxxxxx
or
- (12xx, 3456) comment...
"""
class PdfTextParser:
    """Rebuild one-entry-per-line text from raw ``pdftotext`` output.

    Every input line is classified as one of:

    * a *comment* (table header, ``PS 3.6-200x`` banner, ``Page N`` footer),
    * a *starting line* (begins with a ``(gggg,eeee)`` tag), or
    * a continuation of the entry currently being accumulated.

    Continuation lines are joined (with a single space) to the starting line
    that precedes them; each rebuilt entry is appended to ``_OutLines`` and
    finally written to the output file.

    Typical use::

        parser = PdfTextParser()
        parser.SetInputFileName('04_06PU.txt')
        parser.SetOutputFileName('dicomV3.dic')
        parser.Parse()
    """

    # Exact lines that never carry data.
    _FixedCommentLines = ("Tag Name VR VM", "PS 3.6-2003", "PS 3.6-2004")
    # Patterns are compiled once instead of on every call.
    _PageFooterRe = re.compile(r'^Page [0-9]+$')
    _TagStartRe = re.compile(r'^\([0-9a-fA-Fx]+,[0-9a-fA-F]+\) (.*)$')
    _TagFullRe = re.compile(
        r'^\([0-9a-fA-Fx]+,[0-9a-fA-F]+\) (.*) [A-Z][A-Z] [0-9]$')

    # Cstor
    def __init__(self):
        self._InputFilename = ''
        self._OutputFilename = ''
        # 0 until Open() stores the input file object here.
        self._Infile = 0
        # Cleaned entries, each already terminated by '\n'.
        self._OutLines = []
        # Pieces of the entry currently being assembled.
        self._PreviousBuffers = []

    def SetInputFileName(self, s):
        """Set the path of the text file to read."""
        self._InputFilename = s

    def SetOutputFileName(self, s):
        """Set the path of the cleaned file to write."""
        self._OutputFilename = s

    # Function returning if s is a comment for sure
    def IsAComment(self, s):
        """Return True when ``s`` is a known header/footer line."""
        if s in self._FixedCommentLines:
            return True
        return bool(self._PageFooterRe.match(s))

    def IsAStartingLine(self, s):
        """Return True when ``s`` begins with a ``(gggg,eeee)`` tag."""
        return bool(self._TagStartRe.match(s))

    def IsAFullLine(self, s):
        """Return True when ``s`` starts with a tag and ends with
        two capital letters followed by a single digit."""
        return bool(self._TagFullRe.match(s))

    # FIXME this function could be avoided...
    def IsSuspicious(self, s):
        """Return True when ``s`` is longer than 80 characters."""
        return len(s) > 80

    def AddOutputLine(self, s):
        """Append ``s`` (plus a newline) to the pending output."""
        assert not self.IsAComment(s)
        self._OutLines.append(s + '\n')

    def _FlushEntry(self):
        """Emit the buffered entry because a new one starts, then reset."""
        pieces = self._PreviousBuffers
        joined = ' '.join(pieces)
        if not self.IsAStartingLine(joined):
            if joined:
                print("Not a buffer: %s" % (joined,))
        elif not self.IsSuspicious(joined):
            self.AddOutputLine(joined)
        else:
            # Suspiciously long: maybe a complete entry was followed by a
            # line that does not start as usual with (xxxx,xxxx), so the two
            # got glued together.
            first = pieces[0]
            if self.IsAFullLine(first):
                self.AddOutputLine(first)
                remainder = ' '.join(pieces[1:])
                # A single over-long full line has no remainder; do not
                # emit an empty output line for it.
                if remainder:
                    self.AddOutputLine(remainder)
            elif self.IsAFullLine(joined):
                self.AddOutputLine(joined)
            else:
                # This is the only case where the buffer is not added
                # to _OutLines.
                print("Suspicious and Not a full line: %s" % (first,))
        self._PreviousBuffers = []

    def _FlushAtComment(self):
        """Emit the buffered entry because a comment line was met, then reset."""
        joined = ' '.join(self._PreviousBuffers)
        if joined and self.IsAStartingLine(joined):
            self.AddOutputLine(joined)
        else:
            print("Buffer is: %s" % (joined,))
        # This is a comment: the buffer can safely be cleaned.
        self._PreviousBuffers = []

    def _ProcessLine(self, line):
        """Classify one input line (newline already removed) and buffer it."""
        if self.IsAComment(line):
            self._FlushAtComment()
            return
        if self.IsAStartingLine(line):
            self._FlushEntry()
        # In all cases save the line for potentially growing the entry.
        self._PreviousBuffers.append(line)

    def Open(self):
        """Read the input file, rebuild the entries and write the output."""
        with open(self._InputFilename, 'r') as infile:
            # Kept on the instance because Write() historically closes it.
            self._Infile = infile
            for line in infile:
                # Strip only the line terminator; slicing off the last
                # character would eat data on an unterminated final line.
                self._ProcessLine(line.rstrip('\r\n'))
        # The last entry has no following starting/comment line to trigger
        # a flush, so flush it explicitly instead of dropping it.
        self._FlushEntry()
        self.Write()

    def Write(self):
        """Write the collected entries to the output file."""
        with open(self._OutputFilename, 'w') as outfile:
            outfile.writelines(self._OutLines)
        # Subclasses that open the input themselves rely on Write() to
        # close it; closing an already closed file is harmless.
        if hasattr(self._Infile, 'close'):
            self._Infile.close()

    # Main function to call for parsing
    def Parse(self):
        """Run the whole clean-up (read, rebuild, write)."""
        self.Open()
"""
subclass
"""
class UIDParser(PdfTextParser):
    """PdfTextParser variant for tables whose entries start with a
    ``1.2.840.10008.`` UID instead of a ``(gggg,eeee)`` tag.

    Only entries recognised as full lines are kept; anything else handed
    to AddOutputLine is reported and discarded.
    """

    # The dots of the UID root and of the "PS x.y" reference are escaped so
    # that they match a literal '.' only, not any character.
    _UidStartRe = re.compile(r'^1\.2\.840\.10008\.[0-9.]+ (.*)$')
    _UidFullRes = (
        re.compile(r'^1\.2\.840\.10008\.[0-9.]+ (.*) PS ?[0-9]\.1?[0-9]$'),
        re.compile(
            r'^1\.2\.840\.10008\.[0-9.]+ (.*) Well-known frame of reference$'),
        re.compile(r'^1\.2\.840\.10008\.[0-9.]+ (.*) \(Retired\)$'),
    )
    _UidExtraCommentRe = re.compile(r'^SPM2 (.*) http(.*)$')

    def IsAStartingLine(self, s):
        """Return True when ``s`` begins with a ``1.2.840.10008.`` UID."""
        return bool(self._UidStartRe.match(s))

    def IsAFullLine(self, s):
        """Return True when ``s`` starts with a UID and ends with a
        ``PS x.y`` reference, ``Well-known frame of reference`` or
        ``(Retired)``."""
        for patt in self._UidFullRes:
            if patt.match(s):
                return True
        return False

    def IsAComment(self, s):
        """Return True for the base-class comments and ``SPM2 ... http`` lines."""
        if PdfTextParser.IsAComment(self, s):
            return True
        # else let's enhance the super class
        return bool(self._UidExtraCommentRe.match(s))

    def AddOutputLine(self, s):
        """Keep ``s`` only when it is a full line; otherwise report it."""
        if self.IsAFullLine(s):
            return PdfTextParser.AddOutputLine(self, s)
        print("Discarding: %s" % (s,))
"""
TransferSyntaxParser
"""
class TransferSyntaxParser(UIDParser):
    """UIDParser variant that keeps only the ``Transfer Syntax`` entries."""

    # '.' in the "PS x.y" reference is escaped so it matches a literal dot.
    _TransferSyntaxRe = re.compile(
        r'^(.*) Transfer Syntax PS ?[0-9]\.1?[0-9]$')

    def IsAFullLine(self, s):
        """Return True when ``s`` ends with ``Transfer Syntax PS x.y`` and
        also starts like a UID entry; report every other line."""
        if self._TransferSyntaxRe.match(s):
            return UIDParser.IsAStartingLine(self, s)
        print("Not a TS: %s" % (s,))
        return False
"""
Papyrus parser
pdftotext -f 19 -l 41 -raw -nopgbrk /tmp/Papyrus31Specif.pdf /tmp/Papyrus31Specif.txt
I need to do a second pass for pages:
#29 since I need to find [0-9.]+
#40,41 since they start with numbers in two columns !!
"""
class PapyrusParser(PdfTextParser):
    """PdfTextParser variant for the Papyrus specification tables.

    Entries look like ``<attribute name> (gggg,eeee) ...``; after page 39
    they instead start with two number-like columns.  Bare numbers are
    treated as page numbers / in-page markers, which is why the parser
    tracks the last page and the last in-page number it has seen.
    """

    _NameTagRe = re.compile(
        r"^[A-Za-z '\(\)]+ +\([0-9A-F]+,[0-9A-F]+\) +(.*)$")
    # After page 39, lines are like:
    _NumericStartRe = re.compile(r'^[0-9x]+ [0-9xA-F]+ .*$')
    _NumericFullRe = re.compile(
        r'^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9].*$')
    _AnnexRe = re.compile(r'^.*ANNEXE.*$')
    _PageNumberRe = re.compile(r'^[1-9][0-9]+$')
    _BareNumberRe = re.compile(r'^[0-9]+$')
    _NameTagTypeRe = re.compile(
        r"^([A-Za-z '\(\)]+) (\([0-9A-F]+,[0-9A-F]+\)) ([0-9C]+) (.*)$")
    _NameTagRestRe = re.compile(
        r"^([A-Za-z '\(\)]+) (\([0-9A-F]+,[0-9A-F]+\)) (.*)$")
    # NOTE(review): the final class matches a single character only; the
    # commented-out variants in the original used '+'. Left unchanged.
    _DefinedTermsRe = re.compile(
        r'^[0-9x]+ [0-9xA-F]+ .* [A-Z][A-Z] [0-9] [A-Z, ]$')

    def __init__(self):
        self._PreviousPage = 0
        self._PreviousNumber = 0
        PdfTextParser.__init__(self)

    def IsAStartingLine(self, s):
        """Return True when ``s`` opens an entry (either layout)."""
        if self._NameTagRe.match(s):
            return True
        return bool(self._NumericStartRe.match(s))

    def IsAFullLine(self, s):
        """Return True when ``s`` looks like a complete entry (either layout)."""
        if self._NameTagRe.match(s):
            return True
        return bool(self._NumericFullRe.match(s))

    def IsAComment(self, s):
        """Return True for table headers, ``ANNEXE`` lines and bare numbers
        that are larger than the last page / in-page number seen.

        Note: a True answer for a bare number updates ``_PreviousPage`` /
        ``_PreviousNumber``; a False answer leaves the state untouched.
        """
        # dummy case:
        if s == 'Attribute Name Tag Type Attribute Description':
            return True
        if self._AnnexRe.match(s):
            return True
        # Sometimes there is a line with only one number, so make sure the
        # page number is strictly increasing.  int() is used rather than
        # eval(): the text comes from a file, and eval() would read
        # leading-zero numbers as octal (or fail on them).
        if self._PageNumberRe.match(s):
            page = int(s)
            if page > self._PreviousPage:
                # New page: the in-page numbering restarts.
                self._PreviousNumber = 0
                self._PreviousPage = page
                return True
        # Within each page there are comments that consist of a number;
        # these are tracked per page.
        if self._BareNumberRe.match(s):
            number = int(s)
            if number > self._PreviousNumber:
                self._PreviousNumber = number
                return True
        return False

    def AddOutputLine(self, s):
        """Reformat entry ``s`` and append it to the pending output.

        ``name (tag) type rest`` becomes ``(tag) type name``; ``name (tag)
        rest`` becomes ``(tag) 0 name``; any other text is kept verbatim,
        except lines matching the 'DEFINED TERMS' pattern which become empty.
        """
        assert not self.IsAComment(s)
        s = s.replace('\n', '')
        m = self._NameTagTypeRe.match(s)
        if m:
            ss = m.group(2) + ' ' + m.group(3) + ' ' + m.group(1)
        else:
            m = self._NameTagRestRe.match(s)
            if m:
                ss = m.group(2) + ' 0 ' + m.group(1)
            else:
                ss = s
                # There are two cases, one of which ends with capital
                # letters explaining the 'DEFINED TERMS'.
                if self._DefinedTermsRe.match(s):
                    print("Match %s" % (s,))
                    ss = ''
        self._OutLines.append(ss + '\n')

    def _FlushPapyrusEntry(self):
        """Emit the buffered entry because a new one starts, then reset."""
        joined = ' '.join(self._PreviousBuffers)
        if self.IsAFullLine(joined):
            self.AddOutputLine(joined)
        elif joined:
            print("Not a buffer: %s" % (joined,))
        self._PreviousBuffers = []

    def Open(self):
        """Read the input file, rebuild the entries and write the output."""
        with open(self._InputFilename, 'r') as infile:
            # Kept on the instance because Write() closes the input file.
            self._Infile = infile
            for line in infile:
                # Strip only the line terminator; slicing off the last
                # character would eat data on an unterminated final line.
                line = line.rstrip('\r\n')
                if self.IsAComment(line):
                    joined = ' '.join(self._PreviousBuffers)
                    if joined and self.IsAStartingLine(joined):
                        self.AddOutputLine(joined)
                    # This is a comment: the buffer can safely be cleaned.
                    self._PreviousBuffers = []
                    continue
                if self.IsAStartingLine(line):
                    self._FlushPapyrusEntry()
                # Just to be safe remove any surrounding white space.
                self._PreviousBuffers.append(line.strip())
        # The last entry has no following line to trigger a flush, so
        # flush it explicitly instead of dropping it.
        self._FlushPapyrusEntry()
        self.Write()
"""
Parser for:
GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT
pdftotext -f 81 -l 90 -raw -nopgbrk 2162114_100r5.pdf 2162114_100r5.txt
"""
class GEMSParser(PdfTextParser):
    """PdfTextParser variant for the GE Medical Systems conformance
    statement tables, whose entries look like
    ``<attribute name> (gggg,eeee) VR VM``.
    """

    _GemsStartRe = re.compile(
        r'^[A-Za-z0-9 .#(),_/-]+ +\([0-9A-F]+, ?[0-9A-F]+\) +(.*)$')
    _GemsFullRe = re.compile(
        r'^[A-Za-z0-9 .#(),_/-]+ +\([0-9A-F]+, ?[0-9A-F]+\) [A-Z][A-Z] [0-9]+$')
    # Document banners and the table header, none of which carry data.
    _GemsCommentRes = (
        re.compile(r'^.*GE Medical Systems HISPEED ADVANTAGE CT/i CONFORMANCE STATEMENT.*$'),
        re.compile(r'^GE Medical Systems LightSpeed QX/i CONFORMANCE STATEMENT.*$'),
        re.compile(r'^Attribute Name Tag VR VM$'),
    )

    def IsAStartingLine(self, s):
        """Return True when ``s`` holds an attribute name followed by a tag."""
        return bool(self._GemsStartRe.match(s))

    def IsAFullLine(self, s):
        """Return True when ``s`` is ``name (tag) VR VM``; report it otherwise."""
        if self._GemsFullRe.match(s):
            return True
        print("Not full: %s" % (s,))
        return False

    def IsAComment(self, s):
        """Return True for base-class comments, GE banners and the table header."""
        if PdfTextParser.IsAComment(self, s):
            return True
        for patt in self._GemsCommentRes:
            if patt.match(s):
                return True
        # Explicit False (instead of falling off the end and returning
        # None) to agree with the other IsAComment implementations.
        return False
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -