s=boost::regex_replace(s,expression,"");
boost::regex expression1("(<style.*?>)(.*?)(</style>)",Para); //去除样式
s=boost::regex_replace(s,expression1,"");
boost::regex expression2("(<!--.*?-->)",Para); //去除注释
s=boost::regex_replace(s,expression2,"");
htmlcode=s.c_str(); //stirng类型转换为CString类型
return htmlcode;
}
// Removes every "<...>" span from an HTML fragment and returns what is left.
//
// htmlcode : HTML source text to strip.
// returns  : a copy of htmlcode with each non-greedy "<...>" match deleted;
//            text outside those spans is passed through unchanged.
//
// NOTE(review): Para is the flag value handed to the boost::regex constructor;
// it is declared outside this function, so its exact options are not visible here.
CString CHtmlcodeIE::ExtractAllText(CString htmlcode)
{
    const std::string source = static_cast<std::string>(htmlcode);

    // Match only the tag itself. The earlier pattern "(.*?)(<.*?>)(.*?)" with the
    // format "$1$3" yielded the same output (the third group was always empty and
    // the first group was copied back verbatim), but its lazy leading group made
    // the matcher rescan to the end of the input from every start position once
    // no tags remained, which is quadratic on long tag-free tails.
    const boost::regex tagPattern("<.*?>", Para);

    // Unmatched text is copied through by regex_replace; matched tags become "".
    const std::string stripped = boost::regex_replace(source, tagPattern, "");

    // Convert the std::string result back to CString for the caller.
    return CString(stripped.c_str());
}
CString CHtmlcodeIE::ExtractTextA(CString htmlcode)
{
CString str,ret,retstr,groupstr;
boost::regex express("((?:option .*?value=[\"|\'](.*?)[\"|\'].*?)|(?:a .*?href=[\"|\'](.*?)[\"|\'].*?))>([^>]+)<",Para);
boost::cmatch result; //匹配结果变量
str=htmlcode;
while(boost::regex_search(str, result, express))
{
groupstr="";
for(int i=result.size()-1; i>=2; i--)
{
ret = result[i].str().c_str();
if(i==result.size()-1)
{
ret=DeleteTag(ret); //如果是文本,过滤其中的无用代码
if(ret=="")
{ break;}
}
if(ret!="")
{ groupstr=groupstr+ret+" "; }
}
if(groupstr!="")
{ retstr=retstr+groupstr+"\r\n"; }
str = result[result.size()].second; }
|