📄 uparsertypethread.pas
字号:
unit uParserTypeThread;
interface
uses
Classes, Windows, PerlRegEx, ActiveX, ADODB, SysUtils;
type
TParserTypeThread = class(TThread)
FsPathURL: string; //URL路径
FsFileName: string; //要解析的文件路径
FProHandle: HWND; //处理消息的Handle
FTaskID: integer; //调用线程分配的ID
FSource: TStringList; //要解析的内容
FTarget: TStringList; //解析后的内容
FPerlRegEx: TPerlRegEx; //正则表达式解析组件
FADOCon: TADOConnection; //连接数据库组件
FProADOQ: TADOQuery; //添加指定数据
FErrorMsg: integer; //-1 未处理 0 正确 1 链接数据库出错 2 加载文件出错 3 线程被中断
private
{ Private declarations }
function InitADOConn: boolean; //初始化连接
procedure ParserOver;
protected
procedure Execute; override;
public
constructor Create(mdbLinkStr, sPathURL, sFileName:String;
ProHandle: HWND; TaskID: integer; CreateSuspended: boolean);
destructor Destroy; override;
end;
implementation
uses uConst, uPublic;
constructor TParserTypeThread.Create(mdbLinkStr, sPathURL, sFileName: String;
ProHandle: HWND; TaskID: integer; CreateSuspended: boolean);
begin
inherited Create(CreateSuspended);
FsPathURL := sPathURL; //URL路径
FsFileName := sFileName; //要解析的文件路径
FProHandle := ProHandle; //处理消息的Handle
FTaskID := TaskID; //调用线程分配的ID
Activex.CoInitialize(nil);
FSource := TStringList.Create;
FTarget := TStringList.Create;
FPerlRegEx := TPerlRegEx.Create(nil);
FADOCon := TADOConnection.Create(nil);
FADOCon.ConnectionString := mdbLinkStr;
FADOCon.LoginPrompt := False;
FProADOQ := TADOQuery.Create(nil);
FProADOQ.Connection := FADOCon;
FErrorMsg := -1; //标识为没有处理过的类型
end;
destructor TParserTypeThread.Destroy;
begin
FSource.Free;
FTarget.Free;
FPerlRegEx.Free;
FProADOQ.Free;
FADOCon.Free;
Activex.CoUninitialize;
Synchronize(ParserOver);
inherited Destroy;
end;
procedure TParserTypeThread.Execute;
var i, Count, CurLen: integer;
MatchSwapURL,MatchSwapName: array of string;
tmpS: string;
IsNewInsert: boolean;//标识有插入的
begin
{ Place thread code here }
FreeOnTerminate := True; //当线程结束时自动释放
if not InitADOConn then
begin
FErrorMsg := 1; //标识为连接数据库出错
Exit;
end;//if
if not Terminated then
begin
try
FSource.Clear;
FSource.LoadFromFile(FsFileName);
except
// SendMessage(FProHandle,WM_SendErrorMsg,FTaskID,0);//发送处理失败消息
FErrorMsg := 2;//标识为加载文件失败
Exit;
end;//try
end;//if
FErrorMsg := 3;
FTarget.Clear;
{开始解析}
FPerlRegEx.Options := [preMultiLine];
FPerlRegEx.Subject := FSource.Text;
FPerlRegEx.RegEx := SRegExWordA;
if FPerlRegEx.Match then
repeat{获取词类<A ... > ... </A>链接}
tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
if tmpS <> '' then FTarget.Add(tmpS);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
CurLen := FTarget.Count;
SetLength(MatchSwapURL, CurLen);
SetLength(MatchSwapName, CurLen);
FPerlRegEx.RegEx := SRegExWordAURL;
FPerlRegEx.Subject := FTarget.Text;
i := 0;
if FPerlRegEx.Match then
repeat{获取Href部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
inc(i);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapURL, i);
FPerlRegEx.RegEx := SRegExWordAName;
i := 0;
if FPerlRegEx.Match then
repeat{获取Name部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
tmpS := Trim(FPerlRegEx.MatchedExpression);
MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
inc(i);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapName, i);
Count := Length(MatchSwapURL);
if Count = Length(MatchSwapName) then
begin{名称和URL对应上的话}
i := 0;
while (i < Count) and (not Terminated) do
begin{将单词类别插入TEnWordClass表}
if GetCountCondition('FName','TEnWordClass',
'FName="' + MatchSwapName[i] + '"',FProADOQ) = 0 then
begin//找不到则插入
ExecADOQ('insert into TEnWordClass(FName,FURL) ' +
' values("' + MatchSwapName[i] + '","' +
FsPathURL + MatchSwapURL[i] + '")',FProADOQ);
end
else
begin//找到更新
ExecADOQ('update TEnWordClass set FURL="' +
FsPathURL + MatchSwapURL[i] + '" where FName="' +
MatchSwapName[i] + '"' ,FProADOQ);
end;//if
inc(i);
Sleep(0);
end;//while
end;//if
//抓取其它类似网页
FTarget.Clear;
FPerlRegEx.Options := [preMultiLine];
FPerlRegEx.Subject := FSource.Text;
FPerlRegEx.RegEx := SRegExPageA;
if FPerlRegEx.Match then
repeat{获取Page类的页面链接<A ... > ... </A>链接}
tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
FTarget.Add(tmpS);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
CurLen := FTarget.Count;
SetLength(MatchSwapURL, CurLen);
SetLength(MatchSwapName, CurLen);
FPerlRegEx.RegEx := SRegExWordAURL;
FPerlRegEx.Subject := FTarget.Text;
i := 0;
if FPerlRegEx.Match then
repeat{获取Href部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
inc(i);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapURL, i);
FPerlRegEx.RegEx := SRegExWordAName;
i := 0;
if FPerlRegEx.Match then
repeat{获取Name部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
tmpS := Trim(FPerlRegEx.MatchedExpression);
MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
inc(i);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapName, i);
Count := Length(MatchSwapURL);
if Count = Length(MatchSwapName) then
begin{名称和URL对应上的话}
i := 0;
IsNewInsert := False;
while (i < Count) and (not Terminated) do
begin{将单词类别插入TEnWordClass表}
if GetCountCondition('FURL','TTmpURLList',
'FURL = ''' + FsPathURL + 'article_list.asp' +
MatchSwapURL[i] + '''',FProADOQ) = 0 then
begin//找不到则插入
ExecADOQ('insert into TTmpURLList(FName,FURL,FType) ' +
' values("' + MatchSwapName[i] + '","' +
FsPathURL + 'article_list.asp' + MatchSwapURL[i] + '",0)',FProADOQ);
IsNewInsert := True;
end;
inc(i);
Sleep(0);
end;//while
if IsNewInsert then
begin
//SendMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
PostMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
end;//if
end;//if
//抓取另一类
FTarget.Clear;
FPerlRegEx.Options := [preMultiLine];
FPerlRegEx.Subject := FSource.Text;
FPerlRegEx.RegEx := SRegExWordB;
if FPerlRegEx.Match then
repeat{获取Page类的页面链接<A ... > ... </A>链接}
tmpS := StringReplace(FPerlRegEx.MatchedExpression,#13#10,'',[rfReplaceAll]);
FTarget.Add(tmpS);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
CurLen := FTarget.Count;
SetLength(MatchSwapURL, CurLen);
SetLength(MatchSwapName, CurLen);
FPerlRegEx.RegEx := SRegExWordBURL;
FPerlRegEx.Subject := FTarget.Text;
i := 0;
if FPerlRegEx.Match then
repeat{获取Href部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
MatchSwapURL[i] := ParserHref(FPerlRegEx.MatchedExpression);
inc(i);
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapURL, i);
FPerlRegEx.RegEx := SRegExWordAName;
i := 0;
if FPerlRegEx.Match then
repeat{获取Name部分}
if i >= CurLen then
begin
i := 0;
Break;
end;//if
tmpS := Trim(FPerlRegEx.MatchedExpression);
if Length(tmpS) > 2 then
begin
MatchSwapName[i] := Copy(tmpS,2,Length(tmpS) - 1);
inc(i);
end;
Sleep(0);
until (not Terminated) and (not FPerlRegEx.MatchAgain);
SetLength(MatchSwapName, i);
Count := Length(MatchSwapURL);
if Count = Length(MatchSwapName) then
begin{名称和URL对应上的话}
i := 0;
IsNewInsert := False;
while (i < Count) and (not Terminated) do
begin{将单词类别插入TEnWordClass表}
if GetCountCondition('FURL','TTmpURLList',
'FURL = "' + FsPathURL + MatchSwapURL[i] + '"',FProADOQ) = 0 then
begin//找不到则插入
ExecADOQ('insert into TTmpURLList(FName,FURL,FType) ' +
' values("' + MatchSwapName[i] + '","' +
FsPathURL + MatchSwapURL[i] + '",0)',FProADOQ);
IsNewInsert := True;
end;
inc(i);
Sleep(0);
end;//while
if IsNewInsert then
begin
//SendMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
PostMessage(FProHandle,WM_CheckTmpURLList,0,0);//发送更新相似页面消息
end;//if
end;//if
//SendMessage(FProHandle,WM_SendErrorMsg,FTaskID,1);//发送处理成功消息
FErrorMsg := 0; //标识为正确分析完毕
end;
function TParserTypeThread.InitADOConn: boolean;
begin
Result := True;
try
FADOCon.Close;
FADOCon.Open;
except
Result := False;
end;//try
end;
procedure TParserTypeThread.ParserOver;
begin
//完成处理信息
ProOnParserTypeOver(FTaskID,FErrorMsg);
end;
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -