<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Beyond Search &#187; ir</title>
	<atom:link href="http://www.guwendong.com/post/tag/ir/feed" rel="self" type="application/rss+xml" />
	<link>http://www.guwendong.com</link>
	<description>最好走的路越走越难，最难走的路越走越容易</description>
	<lastBuildDate>Tue, 31 Jan 2012 05:30:38 +0000</lastBuildDate>
	
	<language>zh-CN</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>求助</title>
		<link>http://www.guwendong.com/post/2008/help_for_google_ngram.html</link>
		<comments>http://www.guwendong.com/post/2008/help_for_google_ngram.html#comments</comments>
		<pubDate>Wed, 19 Nov 2008 10:31:38 +0000</pubDate>
		<dc:creator>guwendong</dc:creator>
				<category><![CDATA[技术研究]]></category>
		<category><![CDATA[google]]></category>
		<category><![CDATA[ir]]></category>
		<category><![CDATA[search]]></category>

		<guid isPermaLink="false">http://www.guwendong.com/post/2008/help_for_google_ngram.html</guid>
		<description><![CDATA[第一次通过 blog 求助，希望好心的朋友帮忙！
Google 开放出来的英文 ngram 数据，对我的工作很有帮助。但我通过一些途径联系 LDC 购买，始终得不到回复。阅读我 blog 的朋友，如果有能提供帮助的，烦请联系我，谢谢！
另外，Google Alert  的结果提供了 Feed 输出的方式，可以直接在 Google Reader 里面订阅，非常好用。 

© guwendong for Beyond Search, 2008.
本文网址：http://www.guwendong.com/post/2008/help_for_google_ngram.html
tags: google, ir, search &#124; 参与讨论
<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">您可能也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Google:在Social和Search之间分裂着" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F60077.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/05/26/9714919.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google:在Social和Search之间分裂着 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="新版Google Search界面上线 更加鲜艳" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2F17121.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/01/07/1857791.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">新版Google Search界面上线 更加鲜艳 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google Search Quality 官方说明" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgoogle_search_quality_1.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google Search Quality 官方说明</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google 评价 blog 的指标" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgoogle_blog_rank.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google 评价 blog 的指标</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google Suggest and Adwords" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgsuggest_adwords.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/01/04/1801993.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google Suggest and Adwords</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>第一次通过 blog 求助，希望好心的朋友帮忙！</p>
<p>Google 开放出来的英文 <a href="http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2006T13">ngram 数据</a>，对我的工作很有帮助。但我通过一些途径联系 LDC 购买，始终得不到回复。阅读我 blog 的朋友，如果有能提供帮助的，烦请联系我，谢谢！</p>
<p>另外，<a href="http://www.google.com/alerts">Google Alert</a>  的结果提供了 Feed 输出的方式，可以直接在 Google Reader 里面订阅，非常好用。<br /> </p>
<hr id="rss-footer" />
<small>© guwendong for <a href="http://www.guwendong.com">Beyond Search</a>, 2008.<br/>
本文网址：<a href="http://www.guwendong.com/post/2008/help_for_google_ngram.html">http://www.guwendong.com/post/2008/help_for_google_ngram.html</a><br/>
tags: <a href="http://www.guwendong.com/post/tag/google" rel="tag">google</a>, <a href="http://www.guwendong.com/post/tag/ir" rel="tag">ir</a>, <a href="http://www.guwendong.com/post/tag/search" rel="tag">search</a> | <a href="http://www.guwendong.com/post/2008/help_for_google_ngram.html#comments">参与讨论</a>
</small><br><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">您可能也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Google:在Social和Search之间分裂着" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F60077.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/05/26/9714919.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google:在Social和Search之间分裂着 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="新版Google Search界面上线 更加鲜艳" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2F17121.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/01/07/1857791.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">新版Google Search界面上线 更加鲜艳 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google Search Quality 官方说明" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgoogle_search_quality_1.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google Search Quality 官方说明</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google 评价 blog 的指标" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgoogle_blog_rank.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google 评价 blog 的指标</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Google Suggest and Adwords" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fgsuggest_adwords.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/01/04/1801993.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Google Suggest and Adwords</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://www.guwendong.com/post/2008/help_for_google_ngram.html/feed</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>研究小记：信息抽取</title>
		<link>http://www.guwendong.com/post/2006/ir_notes.html</link>
		<comments>http://www.guwendong.com/post/2006/ir_notes.html#comments</comments>
		<pubDate>Tue, 07 Nov 2006 12:21:35 +0000</pubDate>
		<dc:creator>guwendong</dc:creator>
				<category><![CDATA[技术研究]]></category>
		<category><![CDATA[ir]]></category>

		<guid isPermaLink="false">http://www.guwendong.com/post/2006/ir_notes.html</guid>
		<description><![CDATA[最近接到一个项目，是关于信息抽取方面的，仔细分析下来，还真的是挺难的。对于现实的应用，如何选取一个最有效的数学模型，这个是非常考验算法功力的事情。因此，这几天把自己闷在家里，网也不上了，Blog也不读了，潜心研究信息抽取(Information Extraction)方面的算法。这其中，又把隐马尔可夫算法(HMM)好好地啃了一下。google china blog 上面有一篇文章《数学之美 系列三 — 隐含马尔可夫模型在语言处理中的应用》，比较经典地讲解了隐马尔可夫算法的应用，是一篇很好的文章。之前，我曾经比较系统地研究过《数学之美》系列的前几篇，还把这几篇放到了我的“每日一贴”栏目中，算是对自己学习的一个记录。虽然把这个栏目的名字定为“每日一贴”，但其实频率远达不到每日一贴。这些文字不是自己写的，所以更需要咀嚼之后，才能真正地为我所用。如果仅仅就是“贴”一下的话，那还真没这个必要了，浪费时间。因此希望加入“每日一贴”的文章，都能够真正地对自己有所帮助。
简单陈列一下信息抽取的三大类方法。

基于规则的方法。这个方法解决特定的问题效果比较好，但同时它对被提取信息的要求也比较苛刻。此方法主要基于规则库进行信息抽取，因此，规则库的质量直接决定了算法的召回率和准确率。通常情况下，尤其是应用在商业项目中，要想编制一个高质量的规则库是不经济的。项目起始不能将此方法作为核心，待有了足够的数据积累之后，通过制作训练模型和算法，可以对整个项目的质量有一定程度的提升。
隐马尔可夫方法。这是经典的信息抽取算法。但它要求信息源的内容之间是有顺序关联的，即，要求数据的排列是有逻辑关系的。对于内容之间相互独立的信息，它的效果不是很好。非常不幸，我这个项目的数据源正是如此。它的内容是分段的，对于这些段落中国人有习惯顺序，但这种习惯顺序并不能抽象化成逻辑关系，因此不适合使用隐马尔可夫算法。
基于文本分类的方法。这种方法利用信息之间的独立假设，使用分类算法抽取信息，适用于处理出现次序相互独立信息的抽取问题。配合质量比较高的中文分词算法，信息抽取的精确率与召回率较高。我要做的项目准备以此方法为核心算法。


© guwendong for Beyond Search, 2006.
本文网址：http://www.guwendong.com/post/2006/ir_notes.html
tags: ir &#124; 参与讨论
<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">您可能也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="利用人物角色来做信息架构" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F56110.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5239301.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">利用人物角色来做信息架构 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用户研究思路概述：以淘宝网SNS’分享’为例" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F69378.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2012/01/09/13675953.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用户研究思路概述：以淘宝网SNS’分享’为例 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="手持设备的可用性研究" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2F26865.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5240026.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">手持设备的可用性研究 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="基于生活形态的用户分群研究" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F13407.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5259089.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">基于生活形态的用户分群研究 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="求助" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">求助</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>最近接到一个项目，是关于信息抽取方面的，仔细分析下来，还真的是挺难的。对于现实的应用，如何选取一个最有效的数学模型，这个是非常考验算法功力的事情。因此，这几天把自己闷在家里，网也不上了，Blog也不读了，潜心研究信息抽取(Information Extraction)方面的算法。这其中，又把隐马尔可夫算法(HMM)好好地啃了一下。<a href="http://googlechinablog.com">google china blog</a> 上面有一篇文章《<a href="http://googlechinablog.com/2006/04/blog-post_17.html">数学之美 系列三 — 隐含马尔可夫模型在语言处理中的应用</a>》，比较经典地讲解了隐马尔可夫算法的应用，是一篇很好的文章。之前，我曾经比较系统地研究过《数学之美》系列的前几篇，还把这几篇放到了我的“每日一贴”栏目中，算是对自己学习的一个记录。虽然把这个栏目的名字定为“每日一贴”，但其实频率远达不到每日一贴。这些文字不是自己写的，所以更需要咀嚼之后，才能真正地为我所用。如果仅仅就是“贴”一下的话，那还真没这个必要了，浪费时间。因此希望加入“每日一贴”的文章，都能够真正地对自己有所帮助。</p>
<p>简单陈列一下信息抽取的三大类方法。</p>
<ol>
<li>基于规则的方法。这个方法解决特定的问题效果比较好，但同时它对被提取信息的要求也比较苛刻。此方法主要基于规则库进行信息抽取，因此，规则库的质量直接决定了算法的召回率和准确率。通常情况下，尤其是应用在商业项目中，要想编制一个高质量的规则库是不经济的。项目起始不能将此方法作为核心，待有了足够的数据积累之后，通过制作训练模型和算法，可以对整个项目的质量有一定程度的提升。</li>
<li>隐马尔可夫方法。这是经典的信息抽取算法。但它要求信息源的内容之间是有顺序关联的，即，要求数据的排列是有逻辑关系的。对于内容之间相互独立的信息，它的效果不是很好。非常不幸，我这个项目的数据源正是如此。它的内容是分段的，对于这些段落中国人有习惯顺序，但这种习惯顺序并不能抽象化成逻辑关系，因此不适合使用隐马尔可夫算法。</li>
<li>基于文本分类的方法。这种方法利用信息之间的独立假设，使用分类算法抽取信息，适用于处理出现次序相互独立信息的抽取问题。配合质量比较高的中文分词算法，信息抽取的精确率与召回率较高。我要做的项目准备以此方法为核心算法。</li>
</ol>
<hr id="rss-footer" />
<small>© guwendong for <a href="http://www.guwendong.com">Beyond Search</a>, 2006.<br/>
本文网址：<a href="http://www.guwendong.com/post/2006/ir_notes.html">http://www.guwendong.com/post/2006/ir_notes.html</a><br/>
tags: <a href="http://www.guwendong.com/post/tag/ir" rel="tag">ir</a> | <a href="http://www.guwendong.com/post/2006/ir_notes.html#comments">参与讨论</a>
</small><br><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">您可能也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="利用人物角色来做信息架构" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F56110.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5239301.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">利用人物角色来做信息架构 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用户研究思路概述：以淘宝网SNS’分享’为例" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F69378.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2012/01/09/13675953.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用户研究思路概述：以淘宝网SNS’分享’为例 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="手持设备的可用性研究" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2F26865.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5240026.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">手持设备的可用性研究 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="基于生活形态的用户分群研究" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.alibuybuy.com%2Fposts%2F13407.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2011/04/08/5259089.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">基于生活形态的用户分群研究 (@alibuybuy)</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="求助" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2008%2Fhelp_for_google_ngram.html&from=http%3A%2F%2Fwww.guwendong.com%2Fpost%2F2006%2Fir_notes.html">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">求助</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://www.guwendong.com/post/2006/ir_notes.html/feed</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

