📄 blogdrone.c
字号:
// Catty v2// --------// Copyright (C) 2001, 2002 by Michal Zalewski <lcamtuf@coredump.cx>// Highly optimized for lower memory usage to be gentle with coredump.// Speed improvement: we can use e.g. one byte hash// for every word to avoid slow strcmp, but does it make any sense?// We can also add strcmp caching for a given pointer.int cycle;int exitnow;#include <stdio.h>#include <unistd.h>#include <stdlib.h>#include "string-486.h"#include <sys/time.h>#include <ctype.h>#include <fcntl.h>#include <unistd.h>#include <sys/mman.h>#define RND(x) ((int) (((float)x)*rand()/(RAND_MAX+1.0)))// #define CACHE_DEBUG 1#include "config.h"#include "lang.h"int T_eng;char input[MAXSENT+1][MAXWORD+1][MAXWLEN+1];char* ic[MAXWORD+1];int ilen[MAXKS+1];int lscore[MAXKS+1];unsigned int last[MAXKS+1];int longestword=-1;char* (*kbase[MAXKS+1])[];int ktop;struct word { char* txt; struct word* next;};#define MEMINCR (10*1024*1024)char* mem;int msize;int mptr;// Rationale for this: we need allocate-and-forget memory. Typical// malloc implementation wastes tons of memory on padding, dynamic// structs etc, which is pretty painful if you create e.g. 100000// 6-byte items - and that's what we do. To minimize this, we organize// them into one-megabyte chunks. // Update: we also use mapped files for storage. Economy mode, almost ;-)inline void* my_malloc(int add);inline char* my_strdup(char* x) { int add; add=strlen(x)+1; return strcpy(my_malloc(add),x);}inline void* my_malloc(int add) { char* ret; if (!mem || (add+mptr >= msize)) { int q; q=open(".MaP",O_RDWR|O_CREAT,0600); ftruncate(q,MEMINCR+1); mem=mmap(0,MEMINCR+1,PROT_READ|PROT_WRITE,MAP_SHARED,q,0); unlink(".MaP"); close(q); // printf("F"); fflush(0); msize=MEMINCR; mptr=0; } ret=(mem+mptr); mptr+=add; return ret;}// The idea here is to cache words, so we don't strdup() same word// appearing 100000 times over the database, and we save lots of memory.// This, however, requires very fast lookups, because we're going to make// something to the order of ten million lookups while loading catty. To// do this, we use a quick hashing function for first four characters// to look up the linked list where this word is located (or should be// appended).struct word* wcache[65536];/* Typical ASCII alnum character has 6 significant bytes ;-) Hashing table word: 0123456789ABCDEF AAAAAA BBBBBB (<<3) CCCCCC (<7) DDDDDD (<<10) */#define HASH(a,b,c,d) ((unsigned short) ( (a) ^ ((b) << 3) ^ \ ((c) << 7) ^ ((d) << 10)))#ifdef CACHE_DEBUGint hits, misses;#endifinline char* getcache(unsigned char* x) { struct word* w; char c[4]; memcpy(c,x,4); if (!c[1]) c[2]=c[3]=0; else if (!c[2]) c[3]=0; if (!(w=wcache[HASH(c[0],c[1],c[2],c[3])])) { w=my_malloc(sizeof(struct word)); w->txt=my_strdup(x); w->next=0; wcache[HASH(c[0],c[1],c[2],c[3])]=w;#ifdef CACHE_DEBUG misses++;#endif return w->txt; } while (w->next) { if (!strcmp(x,w->txt)) #ifdef CACHE_DEBUG { hits++; return w->txt; }#else return w->txt;#endif w=w->next; } if (!strcmp(x,w->txt))#ifdef CACHE_DEBUG { hits++; return w->txt; }#else return w->txt;#endif w->next=my_malloc(sizeof(struct word)); w->next->txt=my_strdup(x); w->next->next=0;#ifdef CACHE_DEBUG misses++;#endif return w->next->txt;}// We load the database, splitting phrases into words. This ain't// terribly fast, but reasonable. Smarter approach would simply load// pre-processed binary data from the file, but Catty is not supposed// to be restarted more often than once few weeks, so we can waste a// minute for startup.void load_kbase(void) { unsigned char buf[1024]; int l,skip; FILE* f; skip=(int)getenv("LITEVER"); if (T_eng) { f=fopen(KENG_FILE,"r"); } else { f=fopen(KBASE_FILE,"r"); } if (!f) { perror(KBASE_FILE); exit(1); } while (fgets(buf,1024,f)) { char* x=buf,*y; int cw=0,wcount=0; if (skip) if (RND(5)) continue; while (x[0] && strchr(" \t\n\r",*x)) x++; while ((l=strlen(x)) && strchr("\n\r",x[l-1])) x[l-1]=0; if (!x[0]) continue;// if (!(ktop % 5000)) { printf("."); fflush(0); } y=buf; while (y) { y=strchr(y+1,' '); if (y) wcount++; } kbase[ktop]=my_malloc(4*(wcount+1)); while (1) { int len; y=strchr(x,' '); if (!y) break; *y=0; len=strlen(x); if (len>MAXWLEN) { cw=0; break; } // Nonsense, yes if (len) (*kbase[ktop])[cw++]=getcache(x); if (cw>=MAXWORD) { cw=0; break; } // Skip this nonsense x=y+1; } (*kbase[ktop])[cw]=0; if (cw) ktop++; if (ktop>=MAXKS) { printf("MAXKS exceeded!\n"); exit(1); } } fclose(f); // printf("done (%d rants)\n",ktop);#ifdef CACHE_DEBUG printf("CACHE STATS: %d hits, %d misses.\n",hits,misses);#endif/* for (l=0;l<20;l++) { int n; for (n=0;n<20;n++) { if (!(*kbase[l])[n]) break; printf("[%s] ",(*kbase[l])[n]); } printf("\n"); }*/} inline int detect_question(char* where) { int i; if (T_eng) { for (i=0;i<sizeof(engq)/sizeof(char*);i++) if (!strcmp(where,engq[i])) return 1; } else { for (i=0;i<sizeof(question)/sizeof(char*);i++) if (!strcmp(where,question[i])) return 1; } return 0;}int zdania;// Our royal pain ;-) Main algorithm:// ...// For every word A of input// For every phrase P in the knowledgebase// Score Increase = 0// For every word of B of P// Sequence = 0// While (B+Sequence == A+Sequence and B+Sequence != B+Sequence-1) // Sequence++// Score Increase += Length(A+Sequence)^2// If Sequence != 0, break loop// Score[B] += Score Increase^2;// ...// Then, we do some other stuff, like randomizing scores a bit,// lowering scores for phrases that were used recently, and such.// But generally, we pick the best match.char* handle_input(int top) { static char ret[10000]; int cw=0,c; int i,tm=0; int qmark=0; ret[0]=0; if (detect_question(input[top][0]) && input[top][1][0]) cw=1; // Reset linescores memset(lscore,0,sizeof(lscore)); c=cw; memset(ic,0,sizeof(ic)); i=0; while (input[top][i][0]) { ic[i]=getcache(input[top][i]); i++; } // For each word of input A while (ic[cw]) { if (cw==longestword) { cw++; break; } // printf("Word %d, 95002 = %d, 1 = %d\n",cw,lscore[95002],lscore[1]); // For each line of kbase N for (i=0;i<ktop;i++) { char lmatch[MAXWORD+2]; int kw=0; memset(lmatch,0,sizeof(lmatch)); // For each word B of N while ((*kbase[i])[kw]) { // seq=0 int seq=0; int matched=0; static char *f,*prev; // avoid stupid warning // while A+seq == B+seq, seq++ while ((f=(*kbase[i])[kw+seq]) && (f==ic[cw+seq])) { if (cw+seq==longestword) { seq++; break; } if (!lmatch[cw+seq] && ( !seq || f!=prev)) { int len=strlen(f); lmatch[cw+seq]=1; matched+=/* len* */ len; seq++; prev=f; } else break; } // linescore[N]+=seq*seq lscore[i]+=matched*matched; if (seq) break; kw++; } } cw++; } // Pick line with best linescore for (i=1;i<ktop;i++) { int wc=0; if (last[i]) continue; while ((*kbase[i])[wc]) wc++; if (wc<4) wc=1; else if (wc<10) wc=2; else wc=3; lscore[i]*=wc; lscore[i]+=RND(lscore[i]); if (lscore[i] >= lscore[tm]) tm=i; } if (!lscore[tm]) { // Start a new paragraph if (!exitnow && cycle) { if (zdania>1) printf("\n\n"); tm=RND(ktop); } else {/* if (!RND(2)) tm=RND(ktop); else */ if (cycle==1) { if (T_eng) printf("I feel empty. Try another input.\n"); else printf("Czuje pustke. Wybierz inny tekst.\n"); exit(0); } if (T_eng) printf("\n\nTime for bed.\n"); else printf("\n\nIde spac.\n"); exit(0); } } last[tm]=1;// printf("Sentence %d of %d apparently scored best (%d).\n",tm,ktop,lscore[tm]); cw=0; while ((*kbase[tm])[cw]) { strcat(ret,(*kbase[tm])[cw]); strcat(ret," "); cw++; } if (ret[strlen(ret)-1]==' ') ret[strlen(ret)-1]=0; // if (!detect_question(input[top][0]))// if (!RND(3)) {qmark=1; strcat(ret,"?"); } ret[0]=toupper(ret[0]); if (!strchr(ret,'?')) if (detect_question((*kbase[tm])[0])) { qmark=1; strcat(ret,"?"); } switch (RND(50)) { case 1: strcat(ret,"!"); break; case 2: strcat(ret,"..."); break; case 3: if (!qmark) strcat(ret,"?!"); break; case 4: strcat(ret," ;-)"); break; case 5: strcat(ret," ;)"); break; case 6: strcat(ret," :>"); break; case 7: strcat(ret," ;>"); break; case 8: strcat(ret," :P"); break; case 9: strcat(ret," :("); break; case 10: strcat(ret," :-("); break; case 11: strcat(ret," :0"); break; case 13: strcat(ret," :-("); break; case 14: strcat(ret," :)"); break; case 15: strcat(ret," :*"); break; default: if (!qmark) strcat(ret,"."); break; } if (!RND(4) && (zdania>1)) {zdania=0; strcat(ret,"\n\n");} else zdania++; return ret;}char* cycbuf;int main(void) { int DOIT; int this_sentence=0; int top=0; char ibuf[INBUF+5]; char* f; T_eng=(int)getenv("BLOGENG"); f=getenv("SUBJECT"); if (!f) { printf("please specify $SUBJECT.\n"); exit(1); }// printf(" blog drone bot: %s\n",f); srand(time(0) ^ getpid()); DOIT=RND(10)+12; load_kbase(); memset(ibuf,0,sizeof(ibuf)); strncpy(ibuf,f,INBUF); while (1) { char* x=ibuf; int l; int csen=0,cwo=0,clet=0; cycle++; if (cycle>DOIT) exitnow=1; if (cycbuf) strcpy(ibuf,cycbuf); while (strlen(x) && strchr(" \t\n\r",*x)) x++; while ((l=strlen(x)) && strchr(" \t\n\r",x[l-1])) x[l-1]=0; strcat(x,"."); memset(input,0,sizeof(input)); for (l=0;l<strlen(x);l++) { if (csen>=MAXSENT) break; if (cwo>=MAXWORD) { csen++; cwo=0; clet=0; } if (clet>=MAXWLEN) { cwo++; clet=0; } if (strchr(";.!?()",x[l])) { if (this_sentence>1) { this_sentence=0; cwo=MAXWORD; continue; } this_sentence=0; } if (isalnum(x[l])) input[csen][cwo][clet++]=tolower(x[l]); else if (clet) { cwo++; this_sentence++; clet=0; } } memset(ilen,0,sizeof(ilen)); for (l=0;l<MAXSENT;l++) { while (input[l][ilen[l]][0]) ilen[l]++; if (ilen[l] >= ilen[top]) top=l; } if (cycle>1) { int ltop=0; l=0; while (input[top][l][0]) { if (strlen(input[top][l])>=strlen(input[top][ltop])) ltop=l; l++; } longestword=ltop; } // Add favorites. if (getenv("LEARN")) { char buf[1024]; int wcnt,cnt,sta,i; wcnt=0; while (input[top][wcnt][0]) wcnt++; sta=RND(wcnt); cnt=RND(5)+1; while (sta+cnt>wcnt) cnt--; if (T_eng) strcpy(buf,"cron-en/"); else strcpy(buf,"cron-pl/"); for (i=sta;i<sta+cnt;i++) { strcat(buf,input[top][i]); if (i+1!=sta+cnt) strcat(buf,"+"); } close(open(buf,O_WRONLY|O_CREAT|O_TRUNC,0644)); } printf("%s ",cycbuf=handle_input(top)); fflush(0); } return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -