📄 blogdrone.c

📁 blog generator是一个基于Catty 2引擎的工具
💻 C
字号:
// Catty v2// --------// Copyright (C) 2001, 2002 by Michal Zalewski <lcamtuf@coredump.cx>// Highly optimized for lower memory usage to be gentle with coredump.// Speed improvement: we can use e.g. one byte hash// for every word to avoid slow strcmp, but does it make any sense?// We can also add strcmp caching for a given pointer.int cycle;int exitnow;#include <stdio.h>#include <unistd.h>#include <stdlib.h>#include "string-486.h"#include <sys/time.h>#include <ctype.h>#include <fcntl.h>#include <unistd.h>#include <sys/mman.h>#define RND(x)    ((int) (((float)x)*rand()/(RAND_MAX+1.0)))// #define CACHE_DEBUG 1#include "config.h"#include "lang.h"int T_eng;char input[MAXSENT+1][MAXWORD+1][MAXWLEN+1];char* ic[MAXWORD+1];int  ilen[MAXKS+1];int  lscore[MAXKS+1];unsigned int last[MAXKS+1];int longestword=-1;char* (*kbase[MAXKS+1])[];int ktop;struct word {  char* txt;  struct word* next;};#define MEMINCR (10*1024*1024)char* mem;int msize;int mptr;// Rationale for this: we need allocate-and-forget memory. Typical// malloc implementation wastes tons of memory on padding, dynamic// structs etc, which is pretty painful if you create e.g. 100000// 6-byte items - and that's what we do. To minimize this, we organize// them into one-megabyte chunks. // Update: we also use mapped files for storage. Economy mode, almost ;-)inline void* my_malloc(int add);inline char* my_strdup(char* x) {  int add;  add=strlen(x)+1;  return strcpy(my_malloc(add),x);}inline void* my_malloc(int add) {  char* ret;  if (!mem || (add+mptr >= msize)) {    int q;    q=open(".MaP",O_RDWR|O_CREAT,0600);    ftruncate(q,MEMINCR+1);    mem=mmap(0,MEMINCR+1,PROT_READ|PROT_WRITE,MAP_SHARED,q,0);    unlink(".MaP");    close(q);    // printf("F"); fflush(0);    msize=MEMINCR;    mptr=0;  }  ret=(mem+mptr);  mptr+=add;  return ret;}// The idea here is to cache words, so we don't strdup() same word// appearing 100000 times over the database, and we save lots of memory.// This, however, requires very fast lookups, because we're going to make// something to the order of ten million lookups while loading catty. To// do this, we use a quick hashing function for first four characters// to look up the linked list where this word is located (or should be// appended).struct word* wcache[65536];/*   Typical ASCII alnum character has 6 significant      bytes ;-)     Hashing table word:   0123456789ABCDEF                                     AAAAAA                                  BBBBBB    (<<3)                              CCCCCC        (<7)                           DDDDDD           (<<10) */#define HASH(a,b,c,d) ((unsigned short) ( (a) ^ ((b) << 3) ^ \                                          ((c) << 7) ^ ((d) << 10)))#ifdef CACHE_DEBUGint hits, misses;#endifinline char* getcache(unsigned char* x) {  struct word* w;  char c[4];  memcpy(c,x,4);  if (!c[1]) c[2]=c[3]=0; else  if (!c[2]) c[3]=0;  if (!(w=wcache[HASH(c[0],c[1],c[2],c[3])])) {    w=my_malloc(sizeof(struct word));    w->txt=my_strdup(x);    w->next=0;    wcache[HASH(c[0],c[1],c[2],c[3])]=w;#ifdef CACHE_DEBUG    misses++;#endif    return w->txt;  }  while (w->next) {    if (!strcmp(x,w->txt)) #ifdef CACHE_DEBUG    { hits++; return w->txt; }#else    return w->txt;#endif    w=w->next;  }  if (!strcmp(x,w->txt))#ifdef CACHE_DEBUG    { hits++; return w->txt; }#else    return w->txt;#endif  w->next=my_malloc(sizeof(struct word));  w->next->txt=my_strdup(x);  w->next->next=0;#ifdef CACHE_DEBUG    misses++;#endif  return w->next->txt;}// We load the database, splitting phrases into words. This ain't// terribly fast, but reasonable. Smarter approach would simply load// pre-processed binary data from the file, but Catty is not supposed// to be restarted more often than once few weeks, so we can waste a// minute for startup.void load_kbase(void) {  unsigned char buf[1024];  int l,skip;  FILE* f;    skip=(int)getenv("LITEVER");  if (T_eng) {    f=fopen(KENG_FILE,"r");  } else {    f=fopen(KBASE_FILE,"r");  }  if (!f) { perror(KBASE_FILE); exit(1); }  while (fgets(buf,1024,f)) {    char* x=buf,*y;    int cw=0,wcount=0;    if (skip) if (RND(5)) continue;    while (x[0] && strchr(" \t\n\r",*x)) x++;    while ((l=strlen(x)) && strchr("\n\r",x[l-1])) x[l-1]=0;    if (!x[0]) continue;//    if (!(ktop % 5000)) { printf("."); fflush(0); }    y=buf;    while (y) {      y=strchr(y+1,' ');      if (y) wcount++;    }    kbase[ktop]=my_malloc(4*(wcount+1));     while (1) {      int len;      y=strchr(x,' ');      if (!y) break;      *y=0;      len=strlen(x);      if (len>MAXWLEN) { cw=0; break; } // Nonsense, yes      if (len) (*kbase[ktop])[cw++]=getcache(x);      if (cw>=MAXWORD) { cw=0; break; } // Skip this nonsense      x=y+1;    }    (*kbase[ktop])[cw]=0;    if (cw) ktop++;    if (ktop>=MAXKS) {       printf("MAXKS exceeded!\n");      exit(1);     }  }  fclose(f);  // printf("done (%d rants)\n",ktop);#ifdef CACHE_DEBUG  printf("CACHE STATS: %d hits, %d misses.\n",hits,misses);#endif/*  for (l=0;l<20;l++) {    int n;    for (n=0;n<20;n++) {      if (!(*kbase[l])[n]) break;      printf("[%s] ",(*kbase[l])[n]);    }    printf("\n");  }*/}    inline int detect_question(char* where) {  int i;  if (T_eng) {    for (i=0;i<sizeof(engq)/sizeof(char*);i++)      if (!strcmp(where,engq[i])) return 1;  } else {    for (i=0;i<sizeof(question)/sizeof(char*);i++)      if (!strcmp(where,question[i])) return 1;  }  return 0;}int zdania;// Our royal pain ;-) Main algorithm:// ...// For every word A of input//  For every phrase P in the knowledgebase//    Score Increase = 0//    For every word of B of P//      Sequence = 0//      While (B+Sequence == A+Sequence and B+Sequence != B+Sequence-1) //        Sequence++//        Score Increase += Length(A+Sequence)^2//      If Sequence != 0, break loop//    Score[B] += Score Increase^2;// ...// Then, we do some other stuff, like randomizing scores a bit,// lowering scores for phrases that were used recently, and such.// But generally, we pick the best match.char* handle_input(int top) {  static char ret[10000];  int cw=0,c;  int i,tm=0;  int qmark=0;  ret[0]=0;  if (detect_question(input[top][0]) && input[top][1][0]) cw=1;  // Reset linescores  memset(lscore,0,sizeof(lscore));  c=cw;  memset(ic,0,sizeof(ic));    i=0;    while (input[top][i][0]) {      ic[i]=getcache(input[top][i]);      i++;    }  // For each word of input A  while (ic[cw]) {      if (cw==longestword) { cw++; break; }    //     printf("Word %d, 95002 = %d, 1 = %d\n",cw,lscore[95002],lscore[1]);    // For each line of kbase N    for (i=0;i<ktop;i++) {      char lmatch[MAXWORD+2];      int kw=0;      memset(lmatch,0,sizeof(lmatch));      //     For each word B of N      while ((*kbase[i])[kw]) {        //       seq=0        int seq=0;        int matched=0;        static char *f,*prev; // avoid stupid warning        //       while A+seq == B+seq, seq++        while ((f=(*kbase[i])[kw+seq]) && (f==ic[cw+seq])) {           if (cw+seq==longestword) { seq++; break; }          if (!lmatch[cw+seq] && ( !seq || f!=prev)) {            int len=strlen(f);            lmatch[cw+seq]=1;            matched+=/* len* */ len;            seq++;            prev=f;          } else break;        }        //     linescore[N]+=seq*seq        lscore[i]+=matched*matched;        if (seq) break;        kw++;      }    }    cw++;  }  // Pick line with best linescore  for (i=1;i<ktop;i++) {    int wc=0;    if (last[i]) continue;    while ((*kbase[i])[wc]) wc++;    if (wc<4) wc=1; else     if (wc<10) wc=2; else wc=3;    lscore[i]*=wc;    lscore[i]+=RND(lscore[i]);    if (lscore[i] >= lscore[tm]) tm=i;  }  if (!lscore[tm]) {      // Start a new paragraph      if (!exitnow && cycle) {        if (zdania>1) printf("\n\n");        tm=RND(ktop);      } else {/*    if (!RND(2)) tm=RND(ktop); else */      if (cycle==1) {        if (T_eng) printf("I feel empty. Try another input.\n");          else printf("Czuje pustke. Wybierz inny tekst.\n");        exit(0);     }      if (T_eng) printf("\n\nTime for bed.\n");      else printf("\n\nIde spac.\n");      exit(0);    }  }  last[tm]=1;//  printf("Sentence %d of %d apparently scored best (%d).\n",tm,ktop,lscore[tm]);  cw=0;  while ((*kbase[tm])[cw]) {    strcat(ret,(*kbase[tm])[cw]);    strcat(ret," ");    cw++;  }  if (ret[strlen(ret)-1]==' ') ret[strlen(ret)-1]=0; //  if (!detect_question(input[top][0]))//    if (!RND(3)) {qmark=1; strcat(ret,"?");  }  ret[0]=toupper(ret[0]);  if (!strchr(ret,'?'))    if (detect_question((*kbase[tm])[0])) { qmark=1; strcat(ret,"?"); }  switch (RND(50)) {      case 1: strcat(ret,"!"); break;      case 2: strcat(ret,"..."); break;      case 3: if (!qmark) strcat(ret,"?!"); break;      case 4: strcat(ret," ;-)"); break;      case 5: strcat(ret," ;)"); break;      case 6: strcat(ret," :>"); break;      case 7: strcat(ret," ;>"); break;      case 8: strcat(ret," :P"); break;      case 9: strcat(ret," :("); break;      case 10: strcat(ret," :-("); break;      case 11: strcat(ret," :0"); break;      case 13: strcat(ret," :-("); break;      case 14: strcat(ret," :)"); break;      case 15: strcat(ret," :*"); break;      default: if (!qmark) strcat(ret,"."); break;  }  if (!RND(4) && (zdania>1)) {zdania=0; strcat(ret,"\n\n");} else zdania++;  return ret;}char* cycbuf;int main(void) {  int DOIT;  int this_sentence=0;  int top=0;  char ibuf[INBUF+5];  char* f;  T_eng=(int)getenv("BLOGENG");   f=getenv("SUBJECT");  if (!f) { printf("please specify $SUBJECT.\n"); exit(1); }//  printf(" blog drone bot: %s\n",f);    srand(time(0) ^ getpid());  DOIT=RND(10)+12;  load_kbase();  memset(ibuf,0,sizeof(ibuf));  strncpy(ibuf,f,INBUF);  while (1) {    char* x=ibuf;    int l;    int csen=0,cwo=0,clet=0;    cycle++;    if (cycle>DOIT) exitnow=1;    if (cycbuf) strcpy(ibuf,cycbuf);    while (strlen(x) && strchr(" \t\n\r",*x)) x++;    while ((l=strlen(x)) && strchr(" \t\n\r",x[l-1])) x[l-1]=0;     strcat(x,".");    memset(input,0,sizeof(input));    for (l=0;l<strlen(x);l++) {      if (csen>=MAXSENT) break;      if (cwo>=MAXWORD) { csen++; cwo=0; clet=0; }      if (clet>=MAXWLEN) { cwo++; clet=0; }      if (strchr(";.!?()",x[l])) {        if (this_sentence>1) { this_sentence=0; cwo=MAXWORD; continue; }        this_sentence=0;      }      if (isalnum(x[l])) input[csen][cwo][clet++]=tolower(x[l]); else        if (clet) { cwo++; this_sentence++; clet=0; }    }     memset(ilen,0,sizeof(ilen));    for (l=0;l<MAXSENT;l++) {      while (input[l][ilen[l]][0]) ilen[l]++;      if (ilen[l] >= ilen[top]) top=l;    }        if (cycle>1) { int ltop=0;    l=0;    while (input[top][l][0]) {      if (strlen(input[top][l])>=strlen(input[top][ltop])) ltop=l;      l++;    }    longestword=ltop;    }    // Add favorites.    if (getenv("LEARN")) {       char buf[1024];       int wcnt,cnt,sta,i;       wcnt=0;       while (input[top][wcnt][0]) wcnt++;       sta=RND(wcnt);       cnt=RND(5)+1;       while (sta+cnt>wcnt) cnt--;       if (T_eng) strcpy(buf,"cron-en/"); else strcpy(buf,"cron-pl/");       for (i=sta;i<sta+cnt;i++) {         strcat(buf,input[top][i]);         if (i+1!=sta+cnt) strcat(buf,"+");       }       close(open(buf,O_WRONLY|O_CREAT|O_TRUNC,0644));    }     printf("%s ",cycbuf=handle_input(top));    fflush(0);  }  return 0;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -