⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uparsertypethread.pas

📁 这是一个从指定网页格式分离单词的小程序
💻 PAS
字号:
unit uParserTypeThread;

interface

uses
  Classes, Windows, PerlRegEx, ActiveX, ADODB, SysUtils;

type
  TParserTypeThread = class(TThread)
    FsPathURL: string;                //URL路径
    FsFileName: string;               //要解析的文件路径
    FProHandle: HWND;                 //处理消息的Handle
    FTaskID: integer;                 //调用线程分配的ID
    FSource: TStringList;             //要解析的内容
    FTarget: TStringList;             //解析后的内容
    FPerlRegEx: TPerlRegEx;           //正则表达式解析组件
    FADOCon: TADOConnection;          //连接数据库组件
    FProADOQ: TADOQuery;              //添加指定数据
    FErrorMsg: integer;               //-1 未处理 0 正确  1 链接数据库出错  2 加载文件出错 3 线程被中断
  private
    { Private declarations }
    function InitADOConn: boolean;    //初始化连接
    procedure ParserOver;
  protected
    procedure Execute; override;

  public
    constructor Create(mdbLinkStr, sPathURL, sFileName:String;
      ProHandle: HWND; TaskID: integer; CreateSuspended: boolean);
    destructor Destroy; override;
  end;

implementation

uses uConst, uPublic;



constructor TParserTypeThread.Create(mdbLinkStr, sPathURL, sFileName: String;
  ProHandle: HWND; TaskID: integer; CreateSuspended: boolean);
begin
  inherited Create(CreateSuspended);
  FsPathURL := sPathURL;           //URL路径
  FsFileName := sFileName;         //要解析的文件路径
  FProHandle := ProHandle;         //处理消息的Handle
  FTaskID := TaskID;               //调用线程分配的ID

  Activex.CoInitialize(nil);
  FSource := TStringList.Create;
  FTarget := TStringList.Create;
  FPerlRegEx := TPerlRegEx.Create(nil);
  FADOCon := TADOConnection.Create(nil);
  FADOCon.ConnectionString := mdbLinkStr;
  FADOCon.LoginPrompt := False;
  FProADOQ := TADOQuery.Create(nil);
  FProADOQ.Connection := FADOCon;

  FErrorMsg := -1; //标识为没有处理过的类型
end;

destructor TParserTypeThread.Destroy;
begin
  FSource.Free;
  FTarget.Free;
  FPerlRegEx.Free;
  FProADOQ.Free;
  FADOCon.Free;
  Activex.CoUninitialize;
  Synchronize(ParserOver);
  inherited Destroy;
end;

procedure TParserTypeThread.Execute;
var i, Count, CurLen: integer;
    MatchSwapURL,MatchSwapName: array of string;
    tmpS: string;
    IsNewInsert: boolean;//标识有插入的
begin
  { Place thread code here }
  FreeOnTerminate := True;              //当线程结束时自动释放
  if not InitADOConn then
  begin
    FErrorMsg := 1; //标识为连接数据库出错
    Exit;
  end;//if
  if not Terminated then
  begin
    try
      FSource.Clear;
      FSource.LoadFromFile(FsFileName);
    except
      // SendMessage(FProHandle,WM_SendErrorMsg,FTaskID,0);//发送处理失败消息
      FErrorMsg := 2;//标识为加载文件失败
      Exit;
    end;//try
  end;//if

  FErrorMsg := 3;

  FTarget.Clear;
  {开始解析}

  FPerlRegEx.Options := [preMultiLine];
  FPerlRegEx.Subject := FSource.Text;

  FPerlRegEx.RegEx := SRegExWordA;
  if FPerlRegEx.Match then
  repeat{获取词类<A ... > ... </A>链接}
    tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
    if tmpS <> '' then FTarget.Add(tmpS);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);

  CurLen := FTarget.Count;
  SetLength(MatchSwapURL, CurLen);
  SetLength(MatchSwapName, CurLen);


  FPerlRegEx.RegEx := SRegExWordAURL;
  FPerlRegEx.Subject := FTarget.Text;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Href部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if

    MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
    inc(i);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapURL, i);

  FPerlRegEx.RegEx := SRegExWordAName;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Name部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if

    tmpS := Trim(FPerlRegEx.MatchedExpression);
    MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
    inc(i);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapName, i);

  Count := Length(MatchSwapURL);
  if Count = Length(MatchSwapName) then
  begin{名称和URL对应上的话}
    i := 0;
    while (i < Count) and (not Terminated) do
    begin{将单词类别插入TEnWordClass表}
      if GetCountCondition('FName','TEnWordClass',
        'FName="' + MatchSwapName[i] + '"',FProADOQ) = 0 then
      begin//找不到则插入
        ExecADOQ('insert into TEnWordClass(FName,FURL) ' +
          ' values("' + MatchSwapName[i] + '","' +
           FsPathURL + MatchSwapURL[i] + '")',FProADOQ);
      end
      else
      begin//找到更新
        ExecADOQ('update TEnWordClass set FURL="' +
           FsPathURL + MatchSwapURL[i] + '" where FName="' +
           MatchSwapName[i] + '"' ,FProADOQ);
      end;//if
      inc(i);
      Sleep(0);
    end;//while
  end;//if

  //抓取其它类似网页
  FTarget.Clear;
  FPerlRegEx.Options := [preMultiLine];
  FPerlRegEx.Subject := FSource.Text;

  FPerlRegEx.RegEx := SRegExPageA;
  if FPerlRegEx.Match then
  repeat{获取Page类的页面链接<A ... > ... </A>链接}
    tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
    FTarget.Add(tmpS);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);

  CurLen := FTarget.Count;
  SetLength(MatchSwapURL, CurLen);
  SetLength(MatchSwapName, CurLen);
  FPerlRegEx.RegEx := SRegExWordAURL;
  FPerlRegEx.Subject := FTarget.Text;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Href部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if

    MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
    inc(i);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapURL, i);

  FPerlRegEx.RegEx := SRegExWordAName;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Name部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if

    tmpS := Trim(FPerlRegEx.MatchedExpression);
    MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
    inc(i);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapName, i);

  Count := Length(MatchSwapURL);
  if Count = Length(MatchSwapName) then
  begin{名称和URL对应上的话}
    i := 0;
    IsNewInsert := False;
    while (i < Count) and (not Terminated) do
    begin{将单词类别插入TEnWordClass表}
      if GetCountCondition('FURL','TTmpURLList',
        'FURL = ''' + FsPathURL + 'article_list.asp' +
         MatchSwapURL[i] + '''',FProADOQ) = 0 then
      begin//找不到则插入
        ExecADOQ('insert into TTmpURLList(FName,FURL,FType) ' +
          ' values("' + MatchSwapName[i] + '","' +
           FsPathURL + 'article_list.asp' + MatchSwapURL[i] + '",0)',FProADOQ);
        IsNewInsert := True;
      end;
      inc(i);
      Sleep(0);
    end;//while

    if IsNewInsert then
    begin
      //SendMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
      PostMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
    end;//if
  end;//if

  //抓取另一类
  FTarget.Clear;
  FPerlRegEx.Options := [preMultiLine];
  FPerlRegEx.Subject := FSource.Text;

  FPerlRegEx.RegEx := SRegExWordB;
  if FPerlRegEx.Match then
  repeat{获取Page类的页面链接<A ... > ... </A>链接}
    tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
    FTarget.Add(tmpS);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);

  CurLen := FTarget.Count;
  SetLength(MatchSwapURL, CurLen);
  SetLength(MatchSwapName, CurLen);
  FPerlRegEx.RegEx := SRegExWordBURL;
  FPerlRegEx.Subject := FTarget.Text;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Href部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if
    MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
    inc(i);
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapURL, i);

  FPerlRegEx.RegEx := SRegExWordAName;
  i := 0;
  if FPerlRegEx.Match then
  repeat{获取Name部分}
    if i >= CurLen then
    begin
      i := 0;
      Break;
    end;//if

    tmpS := Trim(FPerlRegEx.MatchedExpression);
    if Length(tmpS) > 2 then
    begin
      MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
      inc(i);
    end;
    Sleep(0);
  until (not Terminated) and (not FPerlRegEx.MatchAgain);
  SetLength(MatchSwapName, i);

  Count := Length(MatchSwapURL);
  if Count = Length(MatchSwapName) then
  begin{名称和URL对应上的话}
    i := 0;
    IsNewInsert := False;
    while (i < Count) and (not Terminated) do
    begin{将单词类别插入TEnWordClass表}
      if GetCountCondition('FURL','TTmpURLList',
        'FURL = "' + FsPathURL + MatchSwapURL[i] + '"',FProADOQ) = 0 then
      begin//找不到则插入
        ExecADOQ('insert into TTmpURLList(FName,FURL,FType) ' +
          ' values("' + MatchSwapName[i] + '","' +
           FsPathURL + MatchSwapURL[i] + '",0)',FProADOQ);
        IsNewInsert := True;
      end;
      inc(i);
      Sleep(0);
    end;//while
    if IsNewInsert then
    begin
      //SendMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
      PostMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
    end;//if
  end;//if


  //SendMessage(FProHandle,WM_SendErrorMsg,FTaskID,1);//发送处理成功消息
  FErrorMsg := 0; //标识为正确分析完毕
end;

function TParserTypeThread.InitADOConn: boolean;
begin
  Result := True;
  try
    FADOCon.Close;
    FADOCon.Open;
  except
    Result := False;
  end;//try
end;

procedure TParserTypeThread.ParserOver;
begin
  //完成处理信息
  ProOnParserTypeOver(FTaskID,FErrorMsg);
end;

end.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -