📄 parse.cs

📁 本代码使用 C# 结合正则表达式处理字符串，可对网页 HTML 源代码进行替换、过滤、查找等操作。
💻 CS
字号:
/// <summary>
/// Sends the current item as a form-encoded HTTP POST and returns the response body.
/// </summary>
/// <remarks>
/// Reads <c>title</c>, <c>detail</c>, <c>provinceno</c>, <c>industryup</c>, <c>date</c> and
/// <c>url</c>, none of which are declared in this block (presumably members of the enclosing
/// class — confirm). The response body is decoded as GB2312.
/// </remarks>
/// <returns>The complete response body as text.</returns>
public string post()
{
    // Every value is URL-encoded, so the request body is pure ASCII and the
    // writer's default encoding does not affect the bytes on the wire.
    string param =
        "title=" + HttpUtility.UrlEncodeUnicode(title)
        + "&content=" + HttpUtility.UrlEncode(detail)
        + "&areano=" + HttpUtility.UrlEncode(provinceno.ToString())
        + "&vocano=" + HttpUtility.UrlEncode(ItemHelper.DistinctVoca(industryup))
        + "&inputtime=" + HttpUtility.UrlEncode(date.ToString());

    Encoding encoding = Encoding.GetEncoding("GB2312");

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";

    // Dispose the writer even if Write throws, so the request stream is not leaked.
    using (StreamWriter requestWriter = new StreamWriter(request.GetRequestStream()))
    {
        requestWriter.Write(param);
    }

    // The response owns the underlying connection; dispose it together with the reader.
    using (WebResponse response = request.GetResponse())
    using (StreamReader responseReader = new StreamReader(response.GetResponseStream(), encoding))
    {
        return responseReader.ReadToEnd();
    }
}


/// <summary>
/// Extracts the detail text, region, information category, industry and attachment flag
/// from the HTML held in <c>content</c>.
/// </summary>
/// <remarks>
/// When no detail text can be extracted, <c>ItemHelper.SetDataNoContent(ID)</c> is called and
/// the method returns; when the cleaned text exceeds 4000 characters,
/// <c>ItemHelper.SetDataTooLength(ID)</c> is called and the method returns.
/// NOTE(review): the region number, category, industry ids and attachment flag are computed
/// into locals but not consumed anywhere in the visible code — confirm whether the remainder
/// of this method was lost.
/// </remarks>
public void GetContent()
{
    // Parse the page data: locate the cell labelled "detail / award / tender content"
    // and capture everything up to the following <hr>.
    Match m =
        Regex.Match(content, @"<td[^>]*>(详细内容[：:]|中标内容[：:]|招标内容[：:])</td>([\s\S])*?<\s*hr([\s\S]*?)>",
                    RegexOptions.IgnoreCase);
    if (!m.Success)
    {
        // No detail section at all: same outcome as an empty detail string.
        ItemHelper.SetDataNoContent(ID);
        return;
    }

    string detail = m.Value;

    // Drop raw line breaks/tabs, script blocks and HTML comments.
    detail =
        Regex.Replace(detail, @"\r|\n|\t|(<\s*script[\s\S]*?</\s*script([\s\S]*?)>)|(<!--([\s\S])*?-->)", "",
                      RegexOptions.IgnoreCase);
    // Ideographic spaces, &nbsp; and cell ends become a single space.
    detail = Regex.Replace(detail, @"　|(&nbsp;)|(</\s*td([\s\S]*?)>)", " ", RegexOptions.IgnoreCase);
    // Row ends, paragraph ends and <br> become line breaks.
    detail =
        Regex.Replace(detail, @"(</\s*tr([\s\S]*?)>)|(</\s*p([\s\S]*?)>)|(<\s*br\s*([\s\S]*?)>)", "\r\n",
                      RegexOptions.IgnoreCase);
    // Strip every remaining tag, then the leading label itself.
    detail = Regex.Replace(detail, @"<[^>]*>", "");
    detail = Regex.Replace(detail, @"(\A\s*(详细内容|中标内容)[：:])\s*", "");

    // Config.xml is needed by several steps below; load it once per call.
    XmlDocument config = new XmlDocument();
    config.Load(Application.StartupPath + @"\Config.xml");

    // Remove every configured <section> text from the detail.
    foreach (XmlNode node in config.SelectNodes("//section"))
    {
        string unwanted = node.InnerText;
        // string.Replace throws ArgumentException for an empty search value.
        if (!string.IsNullOrEmpty(unwanted))
        {
            detail = detail.Replace(unwanted, string.Empty);
        }
    }

    // Collapse blank lines and runs of spaces.
    detail = Regex.Replace(detail, @"(\s*\r\n\s*)+", "\r\n");
    detail = Regex.Replace(detail, @"( )+", " ");

    if (detail.Trim() == "")
    {
        ItemHelper.SetDataNoContent(ID);
        return;
    }

    if (detail.Length > 4000)
    {
        ItemHelper.SetDataTooLength(ID);
        return;
    }

    // Region: map the region name found on the page to the id configured in <areaNode>.
    m = Regex.Match(content, @"<td[^>]*>所属地区[：:]([\s\S])*?</\s*td([\s\S]*?)>", RegexOptions.IgnoreCase);
    int provinceno = -1;
    if (m.Success)
    {
        string province = m.Value;
        province = Regex.Replace(province, @"<[^>]*>", "");
        province = Regex.Replace(province, "所属地区[：:]", "").Trim();
        foreach (XmlNode node in config.SelectNodes("//areaNode"))
        {
            if (node.Attributes["name"].Value == province)
            {
                provinceno = int.Parse(node.Attributes["id"].Value);
                break;
            }
        }
    }

    // Information category: plain text of the labelled cell.
    m = Regex.Match(content, @"<td[^>]*>信息类别[：:]([\s\S])*?</\s*td([\s\S]*?)>", RegexOptions.IgnoreCase);
    string infoType = string.Empty;
    if (m.Success)
    {
        infoType = m.Value;
        infoType = Regex.Replace(infoType, @"<[^>]*>", "");
        infoType = Regex.Replace(infoType, "信息类别[：:]", "").Trim();
    }

    // Industry: replace each configured industry name in the cell text with its id.
    m = Regex.Match(content, @"<td[^>]*>所属行业[：:]([\s\S])*?</\s*td([\s\S]*?)>", RegexOptions.IgnoreCase);
    string industryup = "";
    if (m.Success)
    {
        string industry = m.Value;
        industry = Regex.Replace(industry, @"<[^>]*>", "");
        industry = Regex.Replace(industry, "所属行业[：:]", "").Trim();
        industryup = industry;
        foreach (XmlNode node in config.SelectNodes("//industryNode"))
        {
            string industryName = node.Attributes["name"].Value;
            // string.Replace throws ArgumentException for an empty search value.
            if (!string.IsNullOrEmpty(industryName))
            {
                industryup = industryup.Replace(industryName, node.Attributes["id"].Value);
            }
        }
        // The unfinished statements that followed here (a comma split of the industry
        // text) did not compile and had no effect on any result, so they were removed.
    }

    // Attachment flag: true when the page mentions "related attachments".
    bool hasFile = Regex.Match(content, @"相关附件", RegexOptions.IgnoreCase).Success;
}

/// <summary>
/// Walks every table row in <c>content</c>, and for each row that links to a detail page
/// records the link, title, type and date.
/// </summary>
/// <remarks>
/// Rows whose URL is already present in <c>_list</c> are skipped. For every new row the URL is
/// passed to <c>AddUrlToList</c>, a statement is executed through <c>OleDbHelper</c>, and
/// <c>ListHandler</c> is invoked when it is not null.
/// </remarks>
/// <exception cref="Exception">
/// Thrown with an empty message when a row's date is more than <c>_days</c> days before today.
/// NOTE(review): this looks like a "stop crawling" signal for the caller — confirm before
/// changing the exception type.
/// </exception>
public void GetList()
{
    for (Match trList = Regex.Match(content, @"(<\s*tr[\s\S]*?</\s*tr([\s\S]*?)>)", RegexOptions.IgnoreCase);
         trList.Success;
         trList = trList.NextMatch())
    {
        if (string.IsNullOrEmpty(trList.Value))
        {
            break;
        }

        // Only rows that link to a detail page are of interest.
        if (!Regex.IsMatch(trList.Value, @"_detail\.jsp\?", RegexOptions.IgnoreCase))
        {
            continue;
        }

        MatchCollection matches =
            Regex.Matches(trList.Value, @"(<\s*td[\s\S]*?</\s*td([\s\S]*?)>)", RegexOptions.IgnoreCase);

        string d_url = string.Empty;
        string title = string.Empty;
        string d_type = string.Empty;
        DateTime time = new DateTime(2000, 1, 1);
        string timestring = string.Empty;
        string tableType = string.Empty;

        // Expected layout: 4 cells, where cell 1 = link + title, cell 2 = type, cell 3 = date.
        if (matches.Count == 4)
        {
            Match matchUrl =
                Regex.Match(matches[1].Value,
                            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])",
                            RegexOptions.IgnoreCase);
            if (matchUrl.Success)
            {
                d_url = matchUrl.Groups["url"].Value.Trim();

                // The table type is the file-name part before "_detail.". Rows are selected
                // by "_detail.jsp?", so searching only for "_detail.asp" gave -1 and a negative
                // Substring length; "_detail." covers both extensions.
                int markerIndex = d_url.LastIndexOf("_detail.", StringComparison.OrdinalIgnoreCase);
                if (markerIndex >= 0)
                {
                    // Last '/' before the marker, so slashes in the query string are ignored.
                    int slashIndex = markerIndex > 0 ? d_url.LastIndexOf('/', markerIndex - 1) : -1;
                    tableType = d_url.Substring(slashIndex + 1, markerIndex - slashIndex - 1);
                }

                title = Regex.Replace(matches[1].Value, "<[^>]*>", "").Trim().Replace("（替换）", "");
            }
            d_type = Regex.Replace(matches[2].Value, "<[^>]*>", "").Trim();
            timestring = Regex.Replace(matches[3].Value, "<[^>]*>", "").Trim();
            time = DateTime.Parse(timestring);
        }

        // Already recorded: move on to the next row.
        if (_list.ContainsKey(d_url))
        {
            continue;
        }

        if ((DateTime.Today - time).Days > _days)
        {
            throw new Exception("");
        }

        AddUrlToList(d_url);

        // Save the data to Access.
        // NOTE(review): the statement text below is only "insert into " with no table,
        // columns or placeholders, so the arguments are ignored and the SQL is incomplete.
        // It appears to have been truncated; restore it from the real schema, and prefer
        // OleDb parameters over string formatting for page-derived values.
        string sqlInsert =
            String.Format(
                "insert into ",
                d_url, title.Replace("'", "\""), timestring, d_type, tableType);
        OleDbHelper.ExecuteNonQuery(OleDbHelper.ConntectionString, CommandType.Text, sqlInsert, null);

        if (ListHandler != null)
        {
            ListHandler(d_url);
        }
    }
}
💿 文件大小 3 K
👤 上传用户 machao844655
📂 所属分类：多国语言处理
🏷️ 相关标签

#html #代码 #正 #字符串
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -