⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mlist.pro

📁 prolog开发工具
💻 PRO
字号:
%----------------------------------------------------------------------
% MLIST.PRO - parse mailing list information
%
% MLIST reads mailing list labels and extracts various
% fields from each label, such as first name, last name,
% organization, city, state, zip, country, etc.
% This version was used for some of our direct mail
% and analyzes label formats from either Miller Freeman
% publications, such as AIExpert or Dr. Dobbs, and PCAI
% magazine.
%
% In the case of Miller Freeman lists, all the letters
% are upper case.  The program converts these to a correct
% mix of upper and lower case.
%
% For any list, the program creates an error file, SCAN.ERR,
% that contains labels that the program could not make
% sense of.
%
% main/0 is the entry point when the program is used as
% a stand-alone application, reading an input file of label
% information and outputing the parsed fields.  It takes
% as input the name of the label file to process, and
% the type of list, whether Miller Freeman, 'mf', or
% PCAI, 'pcai'.  The output from main/0 is a file, SCAN.LOG.
% that contains the fields.
%
% Run the program with the input file SCAN.INP and type
% mf to see how it works.
%

% list utilities

append([], X, X).
append([A|X], Y, [A|Z]) :- append(X,Y,Z).

member(A, [A|_]).
member(A, [_|Z]) :- member(A, Z).

reverse(A, Z) :- reverse(A, [], Z).

   reverse([], Z, Z).
   reverse([A|X], SoFar, Z) :- reverse(X, [A|SoFar], Z).

% main

main :-
  write($File to process: $),
  read_string(FName),
  write($File type [mf, pcai]: $),
  read_string(FType),
  string_atom(FType, FTa),
  ml_init(FTa, FName),
  repeat,
  ml_getrecord(R),
  R = eof,
  ml_close.

% The three predicates beginning with ml_ are designed to
% provide an interface to other applications that use this
% Prolog program as a service.  For the application it
% was designed for, the Prolog module is called from an
% Access DB application that reads the mailing list
% labels and saves the parsed information in the DB.
%
% ml_init(Type, Fin) - Type is mf or pcai, for the list
%    type, and Fin is the input file.  Called once at
%    beginning of run.
% ml_close() - Called once at end of run, to close the
%    files.
% ml_getrecord(Record) - called in a loop until the
%    end-of-file is read.  Record is a Prolog structure
%    with the database fields derived from the address
%    label.

ml_init(Type, Fin) :-
  retractall(ftype(_)),
  retractall(files(_,_,_)),
  assert(ftype(Type)),
  fopen(Hin, Fin, r),
  fopen(Herr, $scan.err$, w),
  fopen(Hlog, $scan.log$, w),
  assert(files(Hin, Hlog, Herr)).

ml_close :-
  files(Hin, Hlog, Herr),
  fclose(Hin),
  fclose(Hlog),
  fclose(Herr).

ml_getrecord(Record) :-
  files(Hin, Hlog, _),
  get_parse(Hin, ParseList),
  list_record(ParseList, Record),
  write_record(Hlog, Record),
  !.

% get_parse/2 reads a block of text that is a label,
% and converts it to a list of fields.  The list
% fields is of the form
%   [last('Campbell'), first(['Peter', 'L.']),
%    title(['Director', 'MIS']), ... region('UT')..]

get_parse(Hin, ParseList) :-
  read_block(Hin, TB), !,
  process_block(TB, ParseList).

% write_record/2 is used by the stand-alone version of the
% program.  It writes the fields of the record to the
% file SCAN.OUT.  This step could be kept in an embedded
% version for diagnostics, but in general the fields in
% the record would be entered in a database.

write_record(H, eof) :- !.
write_record(H, record(L,F,T,O,A1,A2,C,R,P,Ct,Tel,Fax)) :-
  write(adding:L), nl,
  write(H, last:L), nl(H),
  write(H, first:F), nl(H),
  write(H, title:T), nl(H),
  write(H, org:O), nl(H),
  write(H, addr1:A1), nl(H),
  write(H, addr2:A2), nl(H),
  write(H, city:C), nl(H),
  write(H, region:R), nl(H),
  write(H, post:P), nl(H),
  write(H, country:Ct), nl(H),
  write(H, tel:Tel), nl(H),
  write(H, fax:Fax), nl(H),
  nl(H).

% get_parse/2, described above, derives a list of fields
% from the input label information.  The list of fields
% is not that useful a format, although easy to build, so
% this code takes those lists and converts them into
% a record format with a fixed number of fields, where
% each field is a simple string, easily digested by a
% database application.

list_record(eof, eof).
list_record(L, record(Last, First, Title,
    Org, Addr1, Addr2, City, Region, Post, Country, Tel, Fax)) :-
  get_field(L, last, Last),
  get_field(L, first, First),
  get_field(L, title, Title),
  get_field(L, org, Org),
  get_field(L, addr1, Addr1),
  get_field(L, addr2, Addr2),
  get_field(L, city, City),
  get_field(L, region, Region),
  get_field(L, post, Post),
  get_field(L, country, Country),
  get_field(L, tel, Tel),
  get_field(L, fax, Fax).

get_field(L, F, Vs) :-
  X =.. [F,Vt],
  member(X, L),
  !,
  field_string(Vt, Vs).
get_field(L, _, $$).
  
field_string(L, Vs) :-
  list(L),
  flatten(L, [H|Lflat]),
  string_atom(S, H),
  atomlist_string(Lflat, S, Vs).
field_string(F, Vs) :-
  string_term(Vs, F).

atomlist_string([], S, S).
atomlist_string([H|T], SoFar, Ans) :-
  string_atom(S, H),
  strcat(SoFar, $ $, X),
  strcat(X, S, Next),
  atomlist_string(T, Next, Ans).

% process_block/2 is a simple front end on parse_block
% that deals with the cases of end-of-file and un-parsed
% labels, as well as ones that successfully parse.

process_block(eof, eof).
process_block(TB, Parse) :-
  parse_block(TB, Parse),
  !.
process_block(TB, _) :-
  files(_, _, Herr),
  write(Herr, $\n***** Failed to Parse:\n$),
  write_block(Herr, TB),
  !, fail.

% read_block
%   A text block is a list of strings, where each string is
%   a line from the input file.  Return 'eof' if end of file,
%   also skip over multiple blank lines between blocks
%   of text.

read_block(H, TB) :-
  read_string(H, FirstS),
  process_first_line(H, FirstS, TB),
  !,
  files(_, Hlog, _),
  write_block(Hlog, TB).

process_first_line(_, '!EOF', eof) :- !.
process_first_line(H, FirstS, TB) :-
  nonblank_string(FirstS),
  read_lines(H, FirstS, [], TB).
process_first_line(H, _, TB) :-
  read_string(H, FirstS),
  process_first_line(H, FirstS, TB).

% read_lines
%  Read input lines as strings until either an empty line
%  or the end of file is reached.  The list of input lines
%  is constructed by adding each new line to the head
%  of the forming list, so the list is in reverse order,
%  which is what we want because the parsing is going
%  to proceed from the last item, looking for address,
%  to the first.

read_lines(_, '!EOF', TB, TB).
read_lines(H, S, SoFar, TB) :-
  nonblank_string(S),
  read_string(H, Snext),
  read_lines(H, Snext, [S|SoFar], TB).
read_lines(_, _, TB, TB).

% write_block
%   Write a text block, simply echo the list of strings in the
%   block to the output device.  Used to report errors.

write_block(H, X) :- atomic(X), writeq(H, X), nl(H), !.
write_block(H, TB) :- reverse(TB, TBf), write_blk(H, TBf), nl(H).

write_blk(H, []).
write_blk(H, [X|Z]) :-
  writeq(H, X), nl(H),
  write_blk(H, Z).

% Parse the block of input lines using a two tiered DCG.
%
% The first tier is concerned with lines, and the input is
% a list of strings.  The DCG uses position in the list
% to help determine what type of information is in each
% string.  DCG rules working the list of strings are
% prefaced with 'ps_'.
%
% The second tier is concerned with characters within
% the lines.  Each line is converted to a list of characters
% and passed to second tier DCG rules to extract the
% information from that line.  The top predicates of each
% DCG rule that works on character lists is prefaced
% with the 'p_'.  Helper rules might have any name.

parse_block(TB, Parse) :-
  ftype(FT),
  parse(FT, Parse, TB, []).

% A Miller Freeman list might have phone information first
% (which is really last as the lines are treated in reverse
% order), whereas PCAI lists do not have phones.

parse(mf, P) -->
  ps_phones(Ph), !,
  ps_addrname(mf, AN),
  {append(Ph, AN, P)}.
parse(pcai, P) -->
  ps_addrname(pcai, P).

% setcontext/1 is used to save context information for use
% in interpreting various abbreviations.

setcontext(X) :-
  retractall(context(_)),
  assert(context(X)).

% parse phone numbers, either in North American format or
% as a list of digits for other continents.

ps_phones([P|T]) -->
  [X],
  {string_list(X, L),
   p_phone(P, L, [])},
   ps_phones(T).
ps_phones([]) --> [].

p_phone(tel(P)) --> spaces,  "TEL", spaces, p_phnum(P).
p_phone(fax(P)) --> spaces,  "FAX", spaces, p_phnum(P).

p_phnum(A/E-N) -->
  areacode(A),
  exchange(E),
  number(N).
p_phnum(P) -->
  phoneword(P).

areacode(AC) -->
  "(", digit(A), digit(B), digit(C), ")", {name(AC, [A,B,C])}.
exchange(E) -->
  spaces, digit(A), digit(B), digit(C), "-", {name(E, [A,B,C])}.
number(N) -->
  digit(A), digit(B), digit(C), digit(D), {name(N, [A,B,C,D])}.

phoneword(W) -->
  spaces,
  phonechar(X),
  phonechars(Y),
  {name(W, [X|Y])}.

phonechars([X|Y]) --> phonechar(X), phonechars(Y).
phonechars([]) --> [].

phonechar(X) --> [X], {numb(X)}.
phonechar(X) --> [X], {member(X, [` , `-, `(, `), `/])}.

% parse geographic area information

ps_addrname(LType, AN) -->
  ps_usarea(A), !,
  ps_rest(LType, R),
  {append(A, R, AN)}.
ps_addrname(LType, AN) -->
  ps_country(C),
  ps_intarea(A2), !,
  ps_rest(LType, R),
  {append([country(C)|A2], R, AN)}.
ps_addrname(_, []) --> [].

ps_usarea(A) -->
  [X],
  {string_list(X, L),
   p_usarea(A, L, [])}.

p_usarea([city(C), region(S), post(Z)]) -->
  city(C),
  state(S),
  uszip(Z),
  spaces.

ps_intarea(A) -->
  [X],
  {string_list(X, L),
   p_intarea(A, L, [])}.

p_intarea([city(C)]) -->
  city(C),
  spaces.
p_intarea([city(C), post(P)]) -->
  city(C),
  post(P),
  spaces.
p_intarea([city(C)]) -->
  anycity(C),
  spaces.

city(C) --> placewords(C).

anycity(C) --> words(C).

state(S) --> upword(S). 

uszip(Z) -->
  spaces,digit(A),digit(B),digit(C),digit(D),digit(E),
  {name(Z,[A,B,C,D,E])}.
uszip(Z) -->
  spaces,digit(A),digit(B),digit(C),digit(D),digit(E),
  "-",digit(F),digit(G),digit(H),digit(I),
  {name(Z, [A,B,C,D,E,`-,F,G,H,I])}.
   
post(P) --> postwords(P).

ps_country(C) -->
  [X],
  {string_list(X,L),
   p_country(C, L, [])}.

p_country(C) --> upwords(C), spaces.

% Parse remaining information, where, depending on the
% type of list and number of lines remaining, might
% be of many forms.

ps_rest(mf, [addr1(A1), org(O), title(T) | N]) -->
  ps_addr(A1),
  ps_org(O),
  ps_title(T),
  ps_names(N).
ps_rest(mf, [addr1(A1) | N]) -->
  ps_addr(A1),
  ps_names(N).
ps_rest(mf, [addr1(A1), org(O) | N]) -->
  ps_addr(A1),
  ps_org(O),
  ps_names(N).
ps_rest(mf, [addr1(A1), addr2(A2), org(O), title(T) | N]) -->
  ps_addr(A2),
  ps_addr(A1),
  ps_org(O),
  ps_title(T),
  ps_names(N).
ps_rest(pcai, [addr1(A1), org(O) | N]) -->
  ps_addr(A1),
  ps_names(N),
  ps_org(O).
ps_rest(pcai, [addr1(A1) | N]) -->
  ps_addr(A1),
  ps_names(N).

% Rules for address lines, organizations, titles
% and names.

ps_addr(A) -->
  [X],
  {string_list(X, L),
   p_addr(A, L, [])}.

p_addr(A) --> {setcontext(addr)},
  words(A), spaces.

ps_org(O) -->
  [X],
  {string_list(X, L),
   p_org(O, L, [])}.

p_org(O) --> {setcontext(org)}, words(O), spaces.

ps_title(T) -->
  [X],
  {string_list(X, L),
   p_title(T, L, [])}.

p_title(T) --> {setcontext(title)}, words(T), spaces.

ps_names([last(LN), first(FN)]) -->
  [X],
  {string_list(X, L),
   p_names(LN, FN, L, [])}.

p_names(L, F) --> {setcontext(name)},
  first(F), last(L), spaces.

first(F) --> words(F).
last(L) --> word(L).

% Lowest level character parsing rules, used to parse
% a line, which is at this point a list of characters,
% into a list of words.  The lists of characters are
% grouped into syntactic chunks which are converted
% to atoms by the name/2 built-in predicate.

words([X|Y]) --> spaces, word(X), words(Y).
words([]) --> [].

word(W) --> spaces, [X], {char(X)}, wchars(Chs),
  {uplow([X|Chs], Z), name(W,Z)}.

upwords([X|Y]) --> spaces, upword(X), upwords(Y).
upwords([]) --> [].

upword(W) --> spaces, [X], {char(X)}, wchars(Chs), {name(W,[X|Chs])}.

postwords([X|Y]) --> spaces, postword(X), postwords(Y).
postwords([]) --> [].

postword(W) --> spaces, [X], {char(X)}, wchars(Chs),
  {hasdigit([X|Chs]), name(W, [X|Chs])}.

placewords([X|Y]) --> spaces, placeword(X), placewords(Y).
placewords([]) --> [].

placeword(W) --> spaces, [X], {char(X)}, wchars(Chs),
  {not(hasdigit([X|Chs])), uplow([X|Chs], Z), name(W,Z)}.

wchars([X|Y]) --> [X], {char(X)}, wchars(Y), !.
wchars([]) --> [].

hasdigit([]) :- !, fail.
hasdigit([X|Y]) :- numb(X), !.
hasdigit([X|Y]) :- hasdigit(Y).

digit(X) --> [X], {X >= `0, X =< `9}.

char(X) :- letter(X), !.
char(X) :- numb(X), !.
char(X) :- member(X,[`', `., `-, `/, `#, `&]), !.

letter(X) :- X >= `a, X =< `z, !.
letter(X) :- X >= `A, X =< `Z, !.

numb(X) :- X >= `0, X =< `9.

spaces --> space, spaces, !.
spaces --> [].

space --> [X], {X =< 32}.
space --> ",".

lf --> [10].

% Convert upper case words to initial cap and lower case
% when we have a Miller Freeman mailing list.  Preserve
% certain forms as all caps, and add punctuation to some
% abbreviations.

uplow(X, X) :- not(ftype(mf)), !.
uplow(X, X) :- not(allletters(X)), !.
uplow(X, Y) :- wired(X, Y), !.
uplow([C|U], [C|L]) :- ullist(U,L).

allletters([]).
allletters([H|T]) :-
  (letter(H); member(H, [`-, `'])),
  !,
  allletters(T).

ullist([], []).
ullist([U|X], [L|Y]) :-
  ulchar(U,L),
  ullist(X,Y).

ulchar(U,L) :-
  Offset is U - `A,
  Offset >= 0,
  Offset < 26,
  !,
  L is `a + Offset.
ulchar(X,X).

wired("OF", "of").
wired("DU", "du").
wired("DE", "de").
wired(X, Y) :-
  context(name), !,
  wname(X, Y).
wired(X, Y) :-
  context(addr), !,
  waddr(X, Y).
wired(X, Y) :-
  context(title), !,
  wtitle(X, Y).
wired(X, Y) :-
  context(org), !,
  worg(X,Y).

wname([X], [X, `.]).
wname("DR", "Dr.").
wname("PHD", "PhD.").
wname("MS", "Ms.").
wname("MR", "Mr.").
wname("JR", "Jr.").

wtitle("GM", "G.M.").
wtitle("MGR", "Mgr.").
wtitle("SW", "SW").
wtitle("IT", "IT").
wtitle("QA", "QA").
wtitle("SE", "SE").
wtitle("TECH", "Tech.").
wtitle("DIR", "Dir.").
wtitle("CEO", "C.E.O.").
wtitle("PRES", "Pres.").
wtitle("ASST", "Asst.").
wtitle("SR", "Sr.").
wtitle("MIS", "MIS").
wtitle("VP", "V.P.").

waddr("PO", "P.O.").
waddr("NW", "NW").
waddr("SE", "SE").
waddr("NE", "NE").
waddr("SW", "SW").
waddr("ST", "St.").
waddr("RD", "Rd.").
waddr("AVE", "Ave.").
waddr("DR", "Dr.").
waddr("CT", "Ct.").
waddr("CIR", "Cir.").
waddr("BLDG", "Bldg.").
waddr("RM", "Rm.").

worg([X], [X, `.]).
worg("USAF", "USAF").
worg("AFB", "AFB").
worg("INC", "Inc.").
worg("LTD", "Ltd.").
worg("CORP", "Corp.").
worg("CO", "Co.").

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -