anagram.c

来自「linux下基于c++的处理器仿真平台。具有处理器流水线」· C语言代码 · 共 648 行 · 第 1/2 页
648 行
/* *  Anagram program by Raymond Chen, *  inspired by a similar program by Brian Scearce * *  This program is Copyright 1991 by Raymond Chen. *              (rjc@math.princeton.edu) * *  This program may be freely distributed provided all alterations *  to the original are clearly indicated as such. *//* There are two tricks.  First is the Basic Idea: * * When the user types in a phrase, the phrase is first preprocessed to * determine how many of each letter appears.  A bit field is then constructed * dynamically, such that each field is large enough to hold the next power * of two larger than the number of times the character appears.  For example, * if the phrase is hello, world, the bit field would be * *   00 00 00 000 000 00 00 *    d e   h   l   o  r  w * * The phrase hello, world, itself would be encoded as * *   01 01 01 011 010 01 01 *    d e   h   l   o  r  w * * and the word hollow would be encoded as * *   00 00 01 010 010 00 01 *    d  e  h   l   o  r  w * * The top bit of each field is set in a special value called the sign. * Here, the sign would be * *   10 10 10 100 100 10 10 *    d  e  h   l   o  r  w * * The reason for packing the values into a bit field is that the operation * of subtracting out the letters of a word from the current phrase can be * carried out in parallel.  for example, subtracting the word hello from * the phrase hello, world, is merely * *    d e   h   l   o  r  w *   01 01 01 011 010 01 01 (dehllloorw) * - 00 00 01 010 010 00 01 (hlloow) * ======================== *   01 01 00 001 000 01 00 (delr) * * Since none of the sign bits is set, the word fits, and we can continue. * Suppose the next word we tried was hood. * *    d e   h   l   o  r  w *   01 01 00 001 000 01 00 (delr) * - 01 00 01 000 010 00 00 (hood) * ======================== *   00 00 11 000 110 01 00 *         ^      ^ * A sign bit is set.  (Two, actually.)  This means that hood does not * fit in delr, so we skip it and try another word.  (Observe that * when a sign bit becomes set, it screws up the values for the letters to * the left of that bit, but that's not important.) * * The inner loop of an anagram program is testing to see if a * word fits in the collection of untried letters.  Traditional methods * keep an array of 26 integers, which are then compared in turn.  This * means that there are 26 comparisons per word. * * This method reduces the number of comparisons to MAX_QUAD, typically 2. * Instead of looping through an array, we merely perform the indicated * subtraction and test if any of the sign bits is set. *//* The nuts and bolts: * * The dictionary is loaded and preprocessed.  The preprocessed dictionary * is a concatenation of copies of the structure: * * struct dictword { *     char bStructureSize;             -- size of this structure *     char cLetters;                   -- number of letters in the word *     char achWord[];                  -- the word itself (0-terminated) * } * * Since this is a variable-sized structure, we keep its size in the structure * itself for rapid stepping through the table. * * When a phrase is typed in, it is first preprocessed as described in the * Basic Idea.  We then go through the dictionary, testing each word.  If * the word fits in our phrase, we build the bit field for its frequency * table and add it to the list of candidates. *//* * The Second Trick: * * Before diving into our anagram search, we then tabulate how many times * each letter appears in our list of candidates, and sort the table, with * the rarest letter first. * * We then do our anagram search. * * Like most anagram programs, this program does a depth-first search. * Although most anagram programs do some sort of heuristics to decide what * order to place words in the list_of_candidates, the search itself proceeds * according to a greedy algorithm.  That is, once you find a word that fits, * subtract it and recurse. * * This anagram program exercises some restraint and does not march down * every branch that shows itself.  Instead, it only goes down branches * that use the rarest unused letter.  This helps to find dead ends faster. * * FindAnagram(unused_letters, list_of_candidates) { *  l = the rarest letter as yet unused *  For word in list_of_candidates { *     if word does not fit in unused_letters, go on to the next word. *     if word does not contain l, defer. *      FindAnagram(unused_letters - word, list_of_candidates[word,...]) *  } * } * * * The heuristic of the Second Trick can probably be improved.  I invite * anyone willing to improve it to do so. *//* Use the accompanying unproto perl script to remove the ANSI-style * prototypes, for those of you who have K&R compilers. */#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <sys/types.h>#include <sys/stat.h>#include <setjmp.h>/* Before compiling, make sure Quad and MASK_BITS are set properly.  For best * results, make Quad the largest integer size supported on your machine. * So if your machine has long longs, make Quad an unsigned long long. * (I called it Quad because on most machines, the largest integer size * supported is a four-byte unsigned long.) * * If you need to be able to anagram larger phrases, increase MAX_QUADS. * If you increase it beyond 4, you'll have to add a few more loop unrolling * steps to FindAnagram. */typedef unsigned long Quad;             /* for building our bit mask */#define MASK_BITS 32                    /* number of bits in a Quad */#define MAX_QUADS 2                     /* controls largest phrase */#define MAXWORDS 26000                  /* dictionary length */#define MAXCAND  5000                   /* candidates */#define MAXSOL   51                     /* words in the solution */#define ALPHABET 26                     /* letters in the alphabet */#define ch2i(ch) ((ch)-'a')             /* convert letter to index */#define i2ch(ch) ((ch)+'a')             /* convert index to letter *//* IBM PC's don't like globs of memory larger than 64K without * special gyrations.  That's where the huges get stuck in.  And the * two types of allocations on an IBM PC need to be handled differently. * * HaltProcessing is called during the anagram search.	If it returns nonzero, * then the search is aborted. * * Cdecl is a macro expanded before the name of all functions that must * use C-style parameter passing.  This lets you change the default parameter * passing style for the other functions. *//* char *malloc(); */#   define huge#   define far#   define StringFormat    "%15s%c"#   define bigmalloc       malloc#   define smallmalloc     malloc#   define smallmallocfail (char *)0#   define HaltProcessing() 0           /* no easy way to interrupt on UNIX */#   define Cdecl/* Code to be used only when debugging lives inside Debug(). * Code to be used only when collecting statistics lives inside Stat(). */#ifdef DEBUG#define Debug(x) x#else#define Debug(x)#endif#ifdef STAT#define Stat(x) x#else#define Stat(x)#endif/* A Word remembers the information about a candidate word. */typedef struct {    Quad aqMask[MAX_QUADS];  /* the word's mask */    char * pchWord;                 /* the word itself */    unsigned cchLength;                 /* letters in the word */} Word;typedef Word * PWord;typedef Word * * PPWord;PWord apwCand[MAXCAND];    /* candidates we've found so far */unsigned cpwCand;                       /* how many of them? *//* A Letter remembers information about each letter in the phrase to be * anagrammed. */typedef struct {    unsigned uFrequency;                /* how many times it appears */    unsigned uShift;                    /* how to mask */    unsigned uBits;                     /* the bit mask itself */    unsigned iq;                        /* which Quad to inspect? */} Letter;typedef Letter * PLetter;Letter alPhrase[ALPHABET]; /* statistics on the current phrase */#define lPhrase(ch) alPhrase[ch2i(ch)]  /* quick access to a letter */int cchPhraseLength;                    /* number of letters in phrase */Quad aqMainMask[MAX_QUADS];/* the bit field for the full phrase */Quad aqMainSign[MAX_QUADS];/* where the sign bits are */int cchMinLength = 3;/* auGlobalFrequency counts the number of times each letter appears, summed * over all candidate words.  This is used to decide which letter to attack * first. */unsigned auGlobalFrequency[ALPHABET];char achByFrequency[ALPHABET];          /* for sorting */char * pchDictionary;               /* the dictionary is read here */#define Zero(t) memset(t, 0, sizeof(t)) /* quickly zero out an integer array *//* Fatal -- print a message before expiring */void Fatal(char *pchMsg, unsigned u) {    fprintf(stdout, pchMsg, u);    exit(1);}/* ReadDict -- read the dictionary file into memory and preprocess it * * A word of length cch in the dictionary is encoded as follows: * *    byte 0    = cch + 3 *    byte 1    = number of letters in the word *    byte 2... = the word itself, null-terminated * * Observe that cch+3 is the length of the total encoding.  These * byte streams are concatenated, and terminated with a 0. */void ReadDict(char *pchFile) {    FILE *fp;    char * pch;    char * pchBase;    unsigned long ulLen;    unsigned cWords = 0;    unsigned cLetters;    int ch;    struct stat statBuf;    if (stat(pchFile, &statBuf)) Fatal("Cannot stat dictionary\n", 0);    ulLen = statBuf.st_size + 2 * (unsigned long)MAXWORDS;    pchBase = pchDictionary = (char *)malloc(ulLen);    if(pchDictionary == NULL)	Fatal("Unable to allocate memory for dictionary\n", 0);    if ((fp = fopen(pchFile, "r")) == NULL)	Fatal("Cannot open dictionary\n", 0);    while (!feof(fp)) {        pch = pchBase+2;                /* reserve for length */        cLetters = 0;        while ((ch = fgetc(fp)) != '\n' && ch != EOF) {            if (isalpha(ch)) cLetters++;            *pch++ = ch;        }        *pch++ = '\0';        *pchBase = pch - pchBase;        pchBase[1] = cLetters;        pchBase = pch;        cWords++;    }    fclose(fp);    *pchBase++ = 0;    fprintf(stdout, "main dictionary has %u entries\n", cWords);    if (cWords >= MAXWORDS)	Fatal("Dictionary too large; increase MAXWORDS\n", 0);    fprintf(stdout, "%lu bytes wasted\n", ulLen - (pchBase - pchDictionary));}void BuildMask(char * pchPhrase) {    int i;    int ch;    unsigned iq;                        /* which Quad? */    int cbtUsed;                        /* bits used in the current Quad */    int cbtNeed;                        /* bits needed for current letter */    Quad qNeed;                         /* used to build the mask */    bzero(alPhrase, sizeof(Letter)*ALPHABET);    bzero(aqMainMask, sizeof(Quad)*MAX_QUADS);    bzero(aqMainSign, sizeof(Quad)*MAX_QUADS);/*    Zero(alPhrase);    Zero(aqMainMask);    Zero(aqMainSign);*/    /* Tabulate letter frequencies in the phrase */
anagram.c - 源码说明

本页面展示了「linux下基于c++的处理器仿真平台。具有处理器流水线」中的 anagram.c 源码文件，采用 C语言编程语言编写，共 648 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与linux相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?