<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Gabe's Random Weird Stuff</title>
	<atom:link href="http://gabrielegiuseppini.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://gabrielegiuseppini.wordpress.com</link>
	<description>Postcards from a Stranger</description>
	<lastBuildDate>Fri, 22 Apr 2011 11:30:03 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='gabrielegiuseppini.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Gabe's Random Weird Stuff</title>
		<link>http://gabrielegiuseppini.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://gabrielegiuseppini.wordpress.com/osd.xml" title="Gabe&#039;s Random Weird Stuff" />
	<atom:link rel='hub' href='http://gabrielegiuseppini.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Proud of My Patch Panel</title>
		<link>http://gabrielegiuseppini.wordpress.com/2011/04/22/proud-of-my-patch-panel/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2011/04/22/proud-of-my-patch-panel/#comments</comments>
		<pubDate>Fri, 22 Apr 2011 11:29:33 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Myself]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=422</guid>
		<description><![CDATA[Today was my last day of work at my current company, and I paid a visit to the company&#8217;s offices to return company property and clean up my drawer. On my way out, I had a last glance at the patch panel of the office &#8211; it took me a few days in February 2010 to [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=422&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Today was my last day of work at my current company, and I paid a visit to the company&#8217;s offices to return company property and clean up my drawer. </p>
<p>On my way out, I had a last glance at the patch panel of the office &#8211; it took me a few days in February 2010 to set it up, and I have to say I&#8217;m very proud of it. All the short cables are hand-made with precise lengths in order to make the panel look tidy, and important cables are labeled for quick diagnostics (&#8220;wireless&#8221;, &#8220;ADSL&#8221;, etc.).</p>
<p>I couldn&#8217;t bear the idea of the panel entering oblivion, and thus I shot a photo. Here it is, so that it won&#8217;t be lost forever in the archives of my memory&#8230;</p>
<div id="attachment_423" class="wp-caption alignnone" style="width: 370px"><a href="http://gabrielegiuseppini.files.wordpress.com/2011/04/patchpanel-small.jpg"><img src="http://gabrielegiuseppini.files.wordpress.com/2011/04/patchpanel-small.jpg?w=360&#038;h=480" alt="Patch Panel" title="PatchPanel (Small)" width="360" height="480" class="size-full wp-image-423" /></a><p class="wp-caption-text">The patch panel I&#039;ve built and of which I&#039;m very proud.</p></div>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/422/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/422/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/422/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=422&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2011/04/22/proud-of-my-patch-panel/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2011/04/patchpanel-small.jpg" medium="image">
			<media:title type="html">PatchPanel (Small)</media:title>
		</media:content>
	</item>
		<item>
		<title>Chris Eng: How to Become an Information Security Thought Leader</title>
		<link>http://gabrielegiuseppini.wordpress.com/2011/03/14/chris-eng-how-to-become-an-information-security-thought-leader/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2011/03/14/chris-eng-how-to-become-an-information-security-thought-leader/#comments</comments>
		<pubDate>Mon, 14 Mar 2011 09:30:21 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Rants]]></category>
		<category><![CDATA[Software Security]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=416</guid>
		<description><![CDATA[Radi brought to my attention this post from Veracode&#8217;s Chris Eng on How to Become an Information Security Thought Leader. This video reminds me of so many thought leaders I&#8217;ve met. If only I could get them to watch it now<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=416&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://radi.r-n-d.org/">Radi</a> brought to my attention this post from Veracode&#8217;s Chris Eng on <a href="http://www.veracode.com/blog/2010/12/how-to-become-an-information-security-thought-leader/"><i>How to Become an Information Security Thought Leader</i></a>.</p>
<p>This video reminds me of so many <i>thought leaders</i> I&#8217;ve met. If only I could get them to watch it now <img src='http://s2.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/416/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/416/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/416/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=416&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2011/03/14/chris-eng-how-to-become-an-information-security-thought-leader/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
		<item>
		<title>A Dimensional Model for Vulnerability Management</title>
		<link>http://gabrielegiuseppini.wordpress.com/2011/01/25/a-dimensional-model-for-vulnerability-management/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2011/01/25/a-dimensional-model-for-vulnerability-management/#comments</comments>
		<pubDate>Tue, 25 Jan 2011 11:08:43 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Business Intelligence]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=392</guid>
		<description><![CDATA[For the past few months I’ve been working on an interesting problem for a large Dutch financial institution. The driver is a situation quite common in the risk management world: the security office of the institution is trying to centralize the reporting of all risk-related information, and among other things, vulnerability management data plays a [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=392&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>For the past few months I’ve been working on an interesting problem for a large Dutch financial institution. The driver is a situation quite common in the risk management world: the security office of the institution is trying to centralize the reporting of all risk-related information, and among other things, vulnerability management data plays a central role. Information from vulnerability scanners needs to be correlated with information from network management and asset management in order to aggregate vulnerabilities up the IT architecture of the institution. A good design for the solution presents some unique challenges, and since I could not find too much help in the BI literature on the Internet, I thought I’d publish some of the results hoping to be of help to anyone else encountering the same design issues I’ve been fighting with for a while.</p>
<p>A simplified view of the data model for vulnerability management looks as follows:</p>
<ul>
<li>A vulnerability scanner reports the <i>observation</i> of a certain <i>vulnerability class</i> (e.g. of a certain Microsoft security bulletin) on a certain <i>IP address</i>;</li>
<li>An <i>IP address</i> is hosted in a certain <i>country</i> and belongs to a certain <i>host</i> which, in turn, runs a number of <i>IT assets</i> (i.e. applications);</li>
<li>An <i>IT asset</i> has an <i>asset rating</i> and belongs to a certain <i>IT service</i>.</li>
</ul>
<p>After correlating the data, the institution would like to be able to answer questions such as:</p>
<ul>
<li>How many vulnerabilities do we have this month for each hosting country?</li>
<li>How many vulnerabilities affect applications with this particular rating?</li>
<li>What’s the trend of vulnerabilities for each IT service for the past 12 months?</li>
<li>Which applications are affected by vulnerabilities that have been unpatched for this amount of time?</li>
<li>How many vulnerabilities underwent this particular state change during the past <i>N</i> weeks?</li>
</ul>
<p>The solution we are currently evaluating consists of two main components. A data warehouse provides the historical repository of the data, which is periodically updated with snapshots built by correlating the various sources of information. On top of this repository, various OLAP cubes provide an analytical interface to the warehouse, allowing security officers to answer questions by simply browsing the cubes with a pivot table.</p>
<p>The first cube I’ve been dealing with is the cube for the vulnerability management “universe” (in SAP terms), and it immediately proved to be a true design challenge for a number of reasons.</p>
<p>First of all, the dimensional model for this cube does not consist of a single fact, but rather, it encompasses a number of facts each providing its own measure(s) and each related to all of the dimensions in some way. As an example, we want measures such as number (count) of vulnerabilities, number (count) of applications, and number (count) of vulnerability state transitions, and each of these measures should be related, for example, to the <i>Vulnerability Severity</i> dimension, so that we can “plot” any of these measures against the various severities. It should be obvious that all of these measures cannot live in the same fact table, and thus the solution needs a dimensional model that deviates from the typical star/ snowflake model.</p>
<p>Second, many of the dimensions in this model are related to each other through a many-to-many relationship; consider, for example, the relationship between a host and an application, in which a host can run multiple applications and a single application can run on multiple hosts. Many-to-many relationships are usually extremely difficult to deal with in an OLAP cube – just think of the issue of counting the same thing twice through two cascaded many-to-many relationships – but the stack I’m currently considering (Microsoft BI with SQL Server Analysis Services 2005) elegantly supports many-to-many relationships through the use of “intermediate measures” (see Marco Russo’s excellent <a href="http://www.sqlbi.eu/Portals/0/Downloads/M2M%20Revolution%201.0.93.pdf" target="_blank">The many-to-many revolution whitepaper</a>).</p>
<p>Finally, many of the relationships between the dimensions are dynamic, and for some of these the business would like to be able to report things as they were at the time the snapshots were taken. As an example, the same application can be run by different hosts each month, and thus the vulnerabilities affecting the application vary over time. In these situations, questions like “<i>which vulnerabilities affect this application?</i>” do not make much sense, as they would be better put as “<i>which vulnerabilities affect this application <b>at this point in time</b>?</i>”.</p>
<p>After a lot of thinking and endless discussions with the business, I finally homed in on an elegant dimensional model for vulnerability management, which I decided to model using the <i>Dimensional Fact Model</i> (DFM) approach (described in <a href="https://dspace.ist.utl.pt/bitstream/2295/164875/1/DFM.pdf" target="_blank">Rizzi’s paper</a>). </p>
<p><b><font size="+2">Complete Fact Schema</font></b></p>
<p>The DFM for the vulnerability management “universe” is as follows:<br />
<a href="http://gabrielegiuseppini.files.wordpress.com/2011/01/vm-dimensional-model.png" target="_blank"><img src="http://gabrielegiuseppini.files.wordpress.com/2011/01/vm-dimensional-model.png?w=500&#038;h=982" alt="Vulnerability Management Dimensional Model" title="Vulnerability Management Dimensional Model" width="500" height="982" class="alignnone size-full wp-image-401" /></a><br />
In the model above I am using some terms in a way that differs from most “standard” security literature; in order to avoid misunderstandings, here are some clarifications on the terms in my ontology:</p>
<ul>
<li><b>Vulnerability</b>: the occurrence, or instance, of a <b>Vulnerability Class</b> on a specific IT device, such as a host. For example, the presence of <i>MS10-070</i> on host <i>jupiter24.acme.eu</i>.</li>
<li><b>Vulnerability Class</b>: a software weakness that exists regardless of its occurrence. For example, <i>MS10-070</i>. It goes without saying that a <b>Vulnerability Class</b> is related to a number of CVE ID’s.</li>
<li><b>Vulnerability Observation</b>: the observation of a <b>Vulnerability</b> at a specific point in time. For example, the detection of <i>MS10-070</i> on host <i>jupiter24.acme.eu</i> which took place during the scan of <i>January 3, 2010</i>.</li>
<li><b>IT Asset</b>: an application or a set of applications behaving as a larger atomic system.</li>
</ul>
<p>Some considerations on the model follow.</p>
<ul>
<li>Some of the facts in the model exist purely for the need of creating many-to-many relationships among dimensions (as explained in Russo’s whitepaper). This is true, for example, for the <i>CVEID Observation</i> fact, which models the many-to-many relationship between the <i>Vulnerability Class</i> dimension and the <i>CVEID</i> dimension, and for the <i>Host Snapshot</i> fact, which models the many-to-many relationship between the <i>Host</i> dimension and the <i>IT Asset</i> dimension. In these cases, I decided to attach a useful measure anyway, such as the total count of (distinct) hosts, so as to be able to answer questions such as “<i>How many hosts are used by this IT service? </i>” without incurring double-counting issues.</li>
<li>The two date dimensions in the model – <i>Scan Date</i> and <i>Snapshot Date</i> – model two completely separate concepts. <i>Scan Date</i> contains dates of vulnerability scans, while <i>Snapshot Date</i> contains the date of the ETL snapshot. Speaking of which, we only need a single <i>Snapshot Date</i> dimension because the model is fully connected, and thus we can relate the <i>Snapshot Date</i> dimension with every fact – and consequently with all the other dimensions. I’ve arbitrarily attached the <i>Snapshot Date</i> dimension to the <i>IT Asset Snapshot</i> fact.</li>
</ul>
<p><b><font size="+2">Dynamic Relations</font></b></p>
<p>My model handles dynamic relations in two distinct ways.</p>
<p><b><font size="+1">Dimension-dimension Relations</font></b><br />
The relationships between separate dimensions, which are modeled with the use of “intermediate” facts, are treated with the “historical truth” approach, that is, they reflect the relationships that were in effect at the time of the snapshot. As an example, if in January host <i>H1</i> ran asset <i>A1</i> and in February the same host ran instead asset <i>A2</i>, then the host’s vulnerabilities in January would be aggregated to asset <i>A1</i> and the host’s vulnerabilities in February would be aggregated to asset <i>A2</i>. In order to achieve this, there’s a trick that is key to the correct modeling of time in my model, and thus it deserves some more explanations. </p>
<p>Let’s expand on the previous example and assume that the following relations were in effect in January and in February:</p>
<ul>
<li>January: IP Address <i>IP1</i> suffers from vulnerability <i>V1</i> and belongs to host <i>H1</i> which runs asset <i>A1</i>.</li>
<li>February: IP Address <i>IP1</i> suffers from vulnerability <i>V2</i> and belongs to host <i>H1</i> which runs asset <i>A2</i>.</li>
</ul>
<p>Now, let’s say that in order to save on storage space, we want to employ a <a href="http://www.ralphkimball.com/html/designtipsPDF/DesignTips2000%20/KimballDT8Perfectly.pdf" target="_blank">Kimball type-2</a> approach for our slowly-changing dimensions (SCD’s) and relationships. In other words, we do a sort of “compression” and re-use records for those dimensions and facts that haven’t changed since the last snapshot. Given that we expect only a small number of things to change from month to month, using type-2 SCD’s would allow us to save a lot of space. In this case, we would have a single dimension record for IP1 in both January and February, a single fact record for the <i>IP Address Snapshot</i> fact (the relationship between IP addresses and hosts, which didn’t change between January and February), a single dimension record for <i>H1</i> in both January and February, and two different records for each of the remaining dimensions and facts, one record for January and one record for February. The situation would look as follows:<br />
<a href="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph.png" target="_blank"><img src="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph.png?w=500&#038;h=166" alt="Compressed Graph" title="Compressed Graph" width="500" height="166" class="alignnone size-full wp-image-406" /></a><br />
At first glance everything seems fine; we are loading lots of data each month, but in reality we only need to store the “deltas” for things that change; we don’t need to duplicate records for <i>IP1</i>, <i>H1</i>, and their relationship, since these haven’t changed at all. However, there is one huge problem with this approach. Let’s say you want to see the vulnerabilities affecting each asset in February. You would select the <i>February</i> value from the <i>Snapshot Date</i> dimension, and then the OLAP engine would have to join its way from the <i>Snapshot Date</i> dimension down to the <i>Vulnerability</i> dimension. The first step works fine – through the February-specific record of the <i>Snapshot Date</i> &lt;&#8212;&gt; <i>IT Asset</i> relationship (<i>IT Asset Snapshot</i> fact), the engine reaches the <i>A2</i> record; cool. Now, from the February-specific record of the <i>IT Asset</i> &lt;&#8212;&gt; <i>Host</i> relationship (<i>Host Snapshot</i> fact), the engine reaches the month-agnostic <i>H1</i> record. Not cool! <i>H1</i> is not month-specific and so the engine has now no way of “propagating” the month-specific-ness from <i>A2</i> to the IP addresses; in fact, from now on <i>H1</i> is linked to <i>IP1</i> which appears to be linked to <b>both</b> <i>V1</i> and <i>V2</i>, as the OLAP engine does not know that it has to use the original choice of the month for the <i>Snapshot Date</i> dimension in its <i>Vulnerability</i> &lt;&#8212;&gt; <i>IP Address</i> table join. We could solve this by attaching another <i>Snapshot Date</i> dimension to the <i>Vulnerability</i> &lt;&#8212;&gt; <i>IP Address</i> relationship (i.e. to the <i>Vulnerability Snapshot</i> fact), but that would make the query cumbersome: you’d have to select <b>two</b> date dimensions and then select the <b>same</b> value for both dimensions.</p>
<p>Seen from another angle, the issue is that the OLAP engine (at least SSAS’s) “walks” the graph of dimension-fact relationships by means of table joins with simple field equality predicates (i.e. “<i>table1.fk = table2.pk</i>”); in order to “propagate” the month choice, you would have to tell it to also consider the month in the join predicate, which unfortunately you can’t do. The simplest way to ensure separation of time is thus to have completely separate graphs for each snapshot, even for arcs and nodes that do not change; this way, “anchoring” a date with the single <i>Snapshot Date</i> dimension means selecting a single instance of the graph – the one that models the universe at that date – and so that “anchor” can be propagated to the whole graph by means of its topology only. In other words, this is equivalent to including the <i>Snapshot Date</i> dimension value in each primary key involved in the graph joins, with the effect that each record in each fact table and each record in each dimension table becomes unique per-snapshot. To achieve this, during each ETL the records in the dimension tables are completely re-created from scratch, and fact records (i.e. relations) are completely re-created from scratch using the surrogate keys of the new dimension records. The same situation as before would then look like this:<br />
<a href="http://gabrielegiuseppini.files.wordpress.com/2011/01/uncompressed-graph.png" target="_blank"><img src="http://gabrielegiuseppini.files.wordpress.com/2011/01/uncompressed-graph.png?w=500&#038;h=166" alt="Uncompressed Graph" title="Uncompressed Graph" width="500" height="166" class="alignnone size-full wp-image-407" /></a><br />
There is of course some degree of waste of space – after all, <i>H1</i> hasn’t really changed, yet we duplicate its records in January and in February. This waste of space, however, can be significantly contained by ensuring that the actual metadata associated with <i>H1</i> (i.e. the values of its hierarchies’ attributes) live in a separate “dimension metadata” table, treated as a type-2 SCD; the <i>Host</i> dimension record contains the foreign key to this metadata table, rather than the metadata itself, and thus we reduce the size of the dimension records to two fields – the dimension’s surrogate key and the dimension’s metadata table key.</p>
<p>You could argue that there is still room for space optimization here. In our previous example, if <i>IP1</i> suffered from <i>V1</i> in <b>both</b> January and February, then we could have stored the <i>Vulnerability</i> &lt;&#8212;&gt; <i>IP Address</i> &lt;&#8212;&gt; <i>Host</i> chain only once, and “fork” only at the <i>Host</i> &lt;&#8212;&gt; <i>IT Asset</i> arc, as shown in the following diagram.<br />
<a href="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph-2.png" target="_blank"><img src="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph-2.png?w=500&#038;h=156" alt="Optimized Graph" title="Optimized Graph" width="500" height="156" class="alignnone size-full wp-image-409" /></a><br />
In more formal terms, if you pivot the model’s graph to look like a tree rooted at the <i>Snapshot Date</i> dimension node, then you could say that we can treat as a type-2 SCD each complete sub-tree that hasn’t changed between ETL periods. The problem, however, is that doing this form of “constant sub-tree detection” could be quite complex to perform in an ETL, and so I would personally choose to duplicate the entire graph (tree) and waste some space rather than risking incorrect reporting due to bugs in the ETL process.</p>
<p><b><font size="+1">Attribute-attribute Relations</font></b><br />
Unlike dimension-dimension relationships, the relationships between the attributes in the hierarchies within a dimension are treated as “today for yesterday”, that is, the latest situation replaces the relationships in the past (effectively a type-1 SCD). As an example, if in January the IT service <i>S1</i> belonged to <i>Switzerland</i> and in February the same IT service belongs to <i>China</i>, then after February the <i>S1</i> vulnerabilities in January would be aggregated to <i>China</i>. The reason for this behavior is that these types of changes are assumed to be “improvements” in our knowledge of the universe, and thus corrections in our knowledge are assumed to be retroactive. At the same time, treating attributes and their relations as type-1 SCD’s allows us to save storage space, which is especially good in light of the fact that, as we have seen, we need to completely duplicate dimension and fact records at each snapshot. It’s worth noting that the underlying data warehouse still stores the information as type-2 SCD’s, in order to support ad-hoc forensic investigations; it’s only the OLAP cube that models these as type-1 SCD’s. </p>
<p><i>Published with full permission of my client.</i></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/392/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/392/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/392/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=392&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2011/01/25/a-dimensional-model-for-vulnerability-management/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2011/01/vm-dimensional-model.png" medium="image">
			<media:title type="html">Vulnerability Management Dimensional Model</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph.png" medium="image">
			<media:title type="html">Compressed Graph</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2011/01/uncompressed-graph.png" medium="image">
			<media:title type="html">Uncompressed Graph</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2011/01/compressed-graph-2.png" medium="image">
			<media:title type="html">Optimized Graph</media:title>
		</media:content>
	</item>
		<item>
		<title>OWASP and Input Validation</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/05/26/owasp-and-input-validation/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/05/26/owasp-and-input-validation/#comments</comments>
		<pubDate>Wed, 26 May 2010 10:01:16 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Rants]]></category>
		<category><![CDATA[Software Security]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=379</guid>
		<description><![CDATA[As a follow-up to my previous post, here&#8217;s another example of OWASP&#8217;s &#8220;authoritative&#8221; prescriptive guidance that gives developers advice that is, in my humble opinion, dangerously wrong, and which contributes in building that sort of &#8220;parrot security expertise&#8221; &#8211; i.e. expertise that is based on repeating nonsensical mantras &#8211; which you see unfortunately way too [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=379&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>As a follow-up to my <a href="http://gabrielegiuseppini.wordpress.com/2010/04/06/input-validation-injection-vulnerabilities/" target="_blank">previous post</a>, here&#8217;s another example of OWASP&#8217;s &#8220;<i>authoritative</i>&#8221; prescriptive guidance that gives developers advice that is, in my humble opinion, dangerously wrong, and which contributes in building that sort of &#8220;parrot security expertise&#8221; &#8211; i.e. expertise that is based on repeating nonsensical mantras &#8211; which you see unfortunately way too often in the software security field.</p>
<p>From the <a href="http://www.owasp.org/index.php/Injection_Prevention_Cheat_Sheet" target="_blank">Injection Prevention Cheat Sheet</a> at <a href="http://www.owasp.org" target="_blank">owasp.org</a>:</p>
<blockquote><p><i><br />
<font size="+2">Injection Prevention Rules</font></p>
<p><font size="+1">Rule #1 (Perform proper input validation):</font><br />
Perform proper input validation. Positive or “whitelist” input validation with appropriate canonicalization is also recommended, but is not a complete defense as many applications require special characters in their input. </p>
<p><font size="+1">Rule #2 (Use a safe API):</font><br />
The preferred option is to use a safe API which avoids the use of the interpreter entirely or provides a parameterized interface. Be careful of APIs, such as stored procedures, that are parameterized, but can still introduce injection under the hood. </p>
<p><font size="+1">Rule #3 (Contextually escape user data):</font><br />
If a parameterized API is not available, you should carefully escape special characters using the specific escape syntax for that interpreter.<br />
</i></p></blockquote>
<p>Notice how input validation is listed as the first rule here. Basically, OWASP is saying that &#8220;<i>if you can, you should put a patch in front of it</i>&#8221;, rather than actually fix the bug itself. So much for good engineering, this is security done by sprinkling some dust to patch holes here and there.</p>
<p>Fixing the bug itself &#8211; i.e. escaping the data that the developer forgot to escape to start with &#8211; is only listed as the third rule, to be used in case the other two fail. </p>
<p>If I were a developer seeking advice on how to write secure software, what would I take from this? I would probably remain firm in my opinion that writing secure software is just a matter of using security features here and there, such as the validators offered by my programming framework, rather than being a matter of creating solid code that is developed correctly in the first place. So, I can continue writing my sloppy code, and then add validators here and there to make my software robust.</p>
<p>Now, some of the OWASP people counter-argue that &#8220;<i>you live in a fairy-tale world; in the real world, there are sloppy developers and you need an approach that secures their code too</i>&#8221;. Really? This is exactly the same as saying &#8220;<i>you live in a fairy-tale world; in the real world, there are developers who write a sort function with cubic complexity and you need an approach that speeds up their code too</i>&#8221;. So, the reasoning goes, OWASP is saying that instead of studying algorithm theory and requiring that our developers read Knuth at least once in their lives, we should all write sorting algorithms that run in cubic time, and &#8220;if we can&#8221;, just run this sloppy code on beefy boxes, and we&#8217;ll all be happy with that. I&#8217;m sure that on a 360-teraflops box my cubic sort will be as fast as sloppy code is secure with a regex validator in front of it.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/379/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/379/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/379/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=379&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/05/26/owasp-and-input-validation/feed/</wfw:commentRss>
		<slash:comments>6</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
		<item>
		<title>Input Validation &amp; Injection Vulnerabilities</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/04/06/input-validation-injection-vulnerabilities/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/04/06/input-validation-injection-vulnerabilities/#comments</comments>
		<pubDate>Tue, 06 Apr 2010 17:07:26 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Rants]]></category>
		<category><![CDATA[Software Security]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=352</guid>
		<description><![CDATA[Nothing pisses me off more quickly and surely than a security consultant boasting that &#8220;you can solve your [insert favorite injection vulnerability here] issue with proper input validation&#8221;. OWASP does this all the time, with preposterous sentences like the following, taken from this page: Input validation is absolutely critical to application security, and most application [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=352&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Nothing pisses me off more quickly and surely than a security consultant boasting that <i>&#8220;you can solve your [insert favorite injection vulnerability here] issue with proper input validation&#8221;</i>.</p>
<p>OWASP does this all the time, with preposterous sentences like the following, taken from <a href="http://www.owasp.org/index.php/Category:Input_Validation">this page</a>:</p>
<blockquote><p><i>Input validation is absolutely critical to application security, and most application risks involve tainted input at some level.</i></p></blockquote>
<p>This is what happens when you take a couple of graduates out of school and thrust them into the client’s dev team as the “security consultants”. And, probably, nothing has contributed more to damaging the reputation of security consultants than this claim has over the past decades. Advocating input validation as the first line of defense against injection vulnerabilities is like Microsoft forbidding users from typing “for”, “if”, or “while” in Office Word for fear that a victim could execute arbitrary code by feeding a Word document to a compiler. It&#8217;s like preventing race conditions by carefully avoiding spawning any threads, or preventing buffer overflows by ensuring that all buffers are &#8220;kept small&#8221;.</p>
<p>Injection vulnerabilities – i.e. those nifty vulnerabilities like cross-site scripting (XSS), SQL injection, XPath injection, OS command injection, and so on – are caused by one simple thing: the dumbness of a developer not seeing that he’s using <b>one</b> channel (i.e. an HTML page, or a SQL query) to convey <b>two</b> types of content (i.e. rendering directives and text content, or SQL statements and literal parameters). A 5-year old kid will tell you that if you want to shove two separate messages into the same channel, you&#8217;ll need to use some type of encoding to keep them separate. A real developer knows this and makes sure that the two messages are kept separate by encoding the one that&#8217;s supposed to be encoded. A dumb (or lazy) developer doesn’t and ends up mixing things up. When the lazy developer concatenates strings to build HTML and forgets to HTML-encode the text in order to avoid it being parsed as rendering directives, what’s the bug? Is it lack of encoding, as plain software engineering thinking would suggest, or is it “bad, nasty, malicious input”, as most security consultants keep swearing? In other words, do injection vulnerabilities originate from lack of separation between messages in the same channel, or do they originate from the evil nature of the messages being mixed up? Well, if it were the latter, then the various RFC’s governing the structure of HTML, SQL, and the like would not have spent time telling you how to encode things. They would have said <i>“Oh, by the way: you can’t have an HTML page that uses the &#8216;&lt;&#8217; or &#8216;&gt;&#8217; characters, we are sorry. HTML is not for math, buddy.”</i></p>
<p>Despite these obvious observations, many security consultants still advocate the use of input validation as the first line of defense against injection vulnerabilities. Input validation? Sure, show me. I have this forum in which people post arbitrary messages. Should I prohibit my users from posting math inequalities, which make use of &#8216;&lt;&#8217; and &#8216;&gt;&#8217;? What about &#8216;&amp;&#8217;? I should forbid that one as well, right? And what will happen when my users try to edit their previous posts after I’ve implemented your input validation? Posts that were legitimate and innocuous yesterday all of a sudden become invalid and can&#8217;t be edited.</p>
<p>And what should I do with the “<i>last_name</i>” field of my form? I’m afraid of SQL injection, should I forbid the use of the single-quote character? Ooops, just got a call from support: we’ve just cut off all of our Irish customers – O’Brian, O’Bannon, and O’Sullivan, they can’t login anymore. How could anyone allow the Irish to have those dangerous, evil, malicious characters in their last names? Oh wait, is the single-quote character good now? I don’t get it, it was bad the day before yesterday and now instead you tell me it’s good. Do you have the slightest idea of what you’re talking about?</p>
<p>And finally, my application saves the user&#8217;s profile to the backend database. I&#8217;m not sure what happens next, the other department &#8211; or the other company &#8211; takes care of that. They might use my data to build HTML, sure, so let&#8217;s forbid &#8216;&lt;&#8217;, &#8216;&gt;&#8217;, and &#8216;&amp;&#8217;. And since we don&#8217;t know whether they use AJAX, let&#8217;s be safe and forbid &#8216;{&#8216;, &#8216;}&#8217;, and the double-quote character. You never know. Oh, they might also use this data to build SQL queries, of course, so let&#8217;s forbid the single-quote character. And since they often generate CSV log files, I&#8217;d say that &#8216;,&#8217; should be illegal. And since their sub-system might also generate space-separated log files, I&#8217;m better safe than sorry &#8211; let me forbid the space character. That character is pure evil, I always suspected that. </p>
<p>What&#8217;s worse, the security consultant will not only tell you that you have to <i>filter</i> and <i>validate</i> your input. He will actually tell you <i>how</i> you have to do it: be careful, it&#8217;s all about black-listing versus white-listing. Black-listing is bad, as it&#8217;s likely that you forget some characters and end up with a security hole. Which would, of course, be caused by that single character that you forgot, not by the complete lack of output encoding throughout your code. The fact that you have been put in charge of writing a Web application because you googled &#8220;Perl&#8221; one day and learnt how to serve a Web page has absolutely <b>nothing</b> to do with the fact that you are riddled with injection vulnerabilities. It&#8217;s the character that you forgot to filter out, that&#8217;s the culprit.</p>
<p>So, it&#8217;s always better to use white-listing, because the worst that can happen is that all your users from Ireland, Italy, Spain, France, Germany, and any other country that uses characters other than those 26 you shoved in your white-list regular expression will be cut off from your Web app. Better safe than sorry, dude. Never mind that I lost a few million customers. </p>
<p>Oh, by the way, security consultant, look, WordPress.com allowed me to type &#8216;&amp;&#8217; in the title of my post. It must be vulnerable to cross-site scripting then.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/352/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/352/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/352/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=352&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/04/06/input-validation-injection-vulnerabilities/feed/</wfw:commentRss>
		<slash:comments>6</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
		<item>
		<title>Chip? No, Thanks</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/02/16/chip-no-thanks/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/02/16/chip-no-thanks/#comments</comments>
		<pubDate>Tue, 16 Feb 2010 13:33:08 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Rants]]></category>
		<category><![CDATA[Software Security]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=347</guid>
		<description><![CDATA[An awesome example of how banks and the payment industry try to screw customers. In the Netherlands, the payment cards handed off by the banks usually have a magnetic strip and a smart card chip. However, stores here gladly accept the magnetic strip and I&#8217;ve never been requested to use the chip. I only had [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=347&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>An awesome example of how banks and the payment industry try to screw customers.</p>
<p>In the Netherlands, the payment cards handed off by the banks usually have a magnetic strip <em>and</em> a smart card chip. However, stores here gladly accept the magnetic strip and I&#8217;ve never been requested to use the chip. I only had to use the chip in Italy, where angry clerks would say &#8220;no, turn the card around, can&#8217;t you see the chip?&#8221;. Apparently, the chip is going big everywhere else in Europe, and banks are marketing the EMV system &#8211; the technology behind the payment with the chip &#8211; as a safe way to prevent fraud.</p>
<p>They don&#8217;t tell you, however, that the EMV system has a serious design flaw which makes the magnetic strip a safer alternative. And they don&#8217;t tell you that if you get defrauded, you lose the money.</p>
<p>My colleague Radi, in fact, just wrote a post about <a href="http://radi.r-n-d.org/2010/02/plastic-problems.html" target="_blank">this paper</a> that shows how a stolen chip card can be used in a successful transaction without knowing the PIN. The worst part is that by using this method, the bank thinks that the PIN <strong>was</strong> used during the transaction, and if the bank thinks you used the PIN, then, legally, <em>you</em> are responsible for the fraud.</p>
<p>The whole issue behind this flaw is that the PIN verification is left to the card, and the bank never sees the PIN. This is different from the magnetic strip method, in which the PIN is sent &#8211; encrypted &#8211; to the bank for verification.</p>
<p>This is why I just put a piece of plastic tape on my chip. The next time a clerk takes my card and inserts it face down in the chip reader, I will smile and say &#8220;No, thanks &#8211; the chip is broken&#8221;.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/347/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/347/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/347/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=347&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/02/16/chip-no-thanks/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
		<item>
		<title>Touchdown: HiSam Will Be Going Live Soon!</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/02/12/touchdown-hisam-will-be-going-live-soon/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/02/12/touchdown-hisam-will-be-going-live-soon/#comments</comments>
		<pubDate>Fri, 12 Feb 2010 10:45:07 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Text Analytics]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=334</guid>
		<description><![CDATA[A few days ago I finished the exhausting re-labeling effort that I had talked about previously, and I started running tests to see where I finally stand against my stated goal. At first the results were a bit discouraging. With the final set of labeled documents – 2,000 sentences added to the previous 2,500 sentences [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=334&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>A few days ago I finished the exhausting <a href="//gabrielegiuseppini.wordpress.com/2010/01/24/improving-the-model/">re-labeling effort</a> that I had talked about previously, and I started running tests to see where I finally stand against my <a href="//gabrielegiuseppini.wordpress.com/2008/12/18/my-hmm-goal/">stated goal</a>.</p>
<p>At first the results were a bit discouraging. With the final set of labeled documents – 2,000 sentences added to the previous 2,500 sentences – the overall F score went down a bit. I was kind of expecting this though, as the more training data from heterogeneous domains you add to the mix, the more variance you have in the thing being learned, resulting in both more potential for errors and more difficulty in learning. This fact is nicely explained by the following excerpt from <a href="//cacm.acm.org/magazines/2010/2/69354-a-few-billion-lines-of-code-later/fulltext">this paper</a> on static code analysis tools:</p>
<blockquote><p><em>The result of summing many independent random variables? A Gaussian distribution, most of it not on the points you saw and adapted to in the lab. Furthermore, Gaussian distributions have tails. As the number of samples grows, so, too, does the absolute number of points several standard deviations from the mean. The unusual starts to occur with increasing frequency.</em></p></blockquote>
<p>The final overall F score – 0.864 at 90% – is still far away from my goal of 0.900 at 60%, and the entity-specific F scores (e.g. 0.900 for <em>GeoLocation</em> entities and 0.915 for <em>Person</em> entities) are far from the F scores boasted by research projects in entity extraction – which are all around 0.93.</p>
<p>So, this very morning I decided to do a real-world test: I took a few articles from CNN, fed these to my HMM, and observed the results. I was astonished!!!!!! The little guy did extremely well with these pieces of text it had never seen before. Here are a few examples – colors correspond to entity types and numbers indicate the probabilities of the extracted entities:</p>
<p>Example 1:</p>
<blockquote><p><em>Greene: &#8221; This is about the limitless capacity of the human heart. &#8221; Bob Greene says a small town in Ohio is one of the most inspiring places in the United States.</em></p></blockquote>
<ul>
<li><span style="color:#ff0000;">Greene</span> (Person: 5.99677679935634E-05)</li>
<li><span style="color:#ff0000;">Bob Greene</span> (Person: 1.25848620925595E-07)</li>
<li><span style="color:#0b00e0;">Ohio</span> (GeoLocation: 0.001397929451232)</li>
<li><span style="color:#0b00e0;">United States</span> (GeoLocation: 0.00286421623850843)</li>
</ul>
<p>Example 2:</p>
<blockquote><p><em>Until, on July 20, 1969, Neil Armstrong, of Wapakoneta, walked on the moon.</em></p></blockquote>
<ul>
<li><span style="color:#ff00ff;">July 20 , 1969</span> (Time: 0.000150556184495453)</li>
<li><span style="color:#ff0000;">Neil Armstrong</span> (Person: 1.33506658714912E-07)</li>
<li><span style="color:#0b00e0;">Wapakoneta</span> (GeoLocation: 6.91960283284816E-07)</li>
<li><span style="color:#808080;">moon </span>(AstronomicalPlace: 0.351350422734393)</li>
</ul>
<p>Example 3:</p>
<blockquote><p><em>A soldier mans a weapon at the rear of a U.S. Army helicopter over Afghanistan in May.</em></p></blockquote>
<ul>
<li><span style="color:#008000;">U.S. Army </span>(Organization: 3.40883387762237E-06)</li>
<li><span style="color:#0b00e0;">Afghanistan</span> (GeoLocation: 0.000349482362808)</li>
<li><span style="color:#ff00ff;">May</span> (Time: 0.00299625468107284)</li>
</ul>
<p>Example 4:</p>
<blockquote><p><em>Senate Judiciary Committee considers Sotomayor nomination on Tuesday.</em></p></blockquote>
<ul>
<li><span style="color:#008000;">Senate Judiciary Committee</span> (Organization: 3.66993148614553E-10)</li>
<li><span style="color:#ff0000;">Sotomayor</span> (Person: 5.60905081933395E-07)</li>
<li><span style="color:#ff00ff;">Tuesday</span> (Time: 0.0389513108539469)</li>
</ul>
<p>So, why the poor F score and the good results? Well, I think I’ve found the explanation. As I said <a href="//gabrielegiuseppini.wordpress.com/2008/12/18/how-i-calculate-the-hmm-performance/">here</a>, when I calculate the performance of my HMMs I’m being Nazi with myself: all the papers I’ve read, in fact, count the number of <em>tokens</em> correctly tagged by their systems, while I count the number of correct <em>tags</em>. This means that when my HMM extracts “Ohio” from “I’m going to Northern Ohio”, I count that as zero <em>recall</em> – the expected tag is “Northern Ohio” and my guy hasn’t found it. On the other hand, research papers would count that as one token out of two, which yields a 0.5 <em>recall</em>.</p>
<p>With this in mind, the results are so good that I’ve decided to set in motion the “release” machine. It took me a couple of years but the first piece of HiSam will finally be live soon!!!!</p>
<p>These are the last TODO items before I start working on the commercial offering:</p>
<ol>
<li>Add an option to calculate the F score using the research papers’ method, and compare this score to their score;</li>
<li>Label a few more documents in order to reach better stability and see whether the learning curve shifts up;</li>
<li>Compress the XML serialization of the model – the current XML takes up 800 MB of disk space and takes forever to load…</li>
</ol>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/334/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/334/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=334&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/02/12/touchdown-hisam-will-be-going-live-soon/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
		<item>
		<title>Canary Hell</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/02/08/canary-hell/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/02/08/canary-hell/#comments</comments>
		<pubDate>Mon, 08 Feb 2010 11:27:57 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Rants]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=330</guid>
		<description><![CDATA[I&#8217;m finally back from my London stay, and I&#8217;m finding the time to answer a question that many asked me with incredulity: how come it took you 1 and ½ hours to commute between Chelsea and Canary Wharf?!?!? Guess what, this is the Canary Wharf tube station at the time I used to get [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=330&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m finally back from my London stay, and I&#8217;m finding the time to answer a question that many asked me with incredulity: how come it took you 1 and ½ hours to commute between Chelsea and Canary Wharf?!?!?</p>
<p>Guess what, this is the Canary Wharf tube station at the time I used to get there:<br />
<div id="attachment_331" class="wp-caption alignnone" style="width: 370px"><a href="http://gabrielegiuseppini.files.wordpress.com/2010/02/img_0371-small.jpg"><img class="size-full wp-image-331" title="Canary Wharf Station" src="http://gabrielegiuseppini.files.wordpress.com/2010/02/img_0371-small.jpg?w=360&#038;h=480" alt="" width="360" height="480" /></a><p class="wp-caption-text">(photo by Radi)</p></div></p>
<p>IMHO, getting from one point to the other among this crowd in 1 and ½ hours seems an achievement to me…but still, I had to endure skeptical eyes over and over while I was complaining about the commute. </p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/330/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/330/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/330/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=330&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/02/08/canary-hell/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2010/02/img_0371-small.jpg" medium="image">
			<media:title type="html">Canary Wharf Station</media:title>
		</media:content>
	</item>
		<item>
		<title>Improving the Model</title>
		<link>http://gabrielegiuseppini.wordpress.com/2010/01/24/improving-the-model/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2010/01/24/improving-the-model/#comments</comments>
		<pubDate>Sun, 24 Jan 2010 10:38:44 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Text Analytics]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=322</guid>
		<description><![CDATA[For the past couple of months I’ve paused further development of the model itself (the “MultiEntity” model) in order to focus on a last round of re-labeling of the training data with three goals: Label new types of entities (monetary values and time expressions) together with the three “classic” ones (person names, geographical places, and [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=322&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://gabrielegiuseppini.files.wordpress.com/2010/01/imprvchartjan242010.jpg"></a>For the past couple of months I’ve paused further development of the model itself (the “MultiEntity” model) in order to focus on a last round of re-labeling of the training data with three goals:</p>
<ol>
<li>Label new types of entities (monetary values and time expressions) together with the three “classic” ones (person names, geographical places, and organizations);</li>
<li>Be more strict with my labeling and make sure to adhere to a set of guidelines that I’ve implemented with the goal of ensuring consistency of the training data;</li>
<li>Enable a new mechanism that takes advantage of the inner structure of certain entity names – which is proving to be the key difference in reaching very good performance and on which I’m not yet ready to publicly elaborate.</li>
</ol>
<p>At the same time I have also re-tokenized the training text being labeled in order to take advantage of some improvements I had done in the past years in the tokenization module.</p>
<p>On December 22 I reached a milestone – 60% of the “old” training data re-tokenized and re-labeled, about 2,900 sentences – and I decided it was time to pause the re-labeling effort and see whether I was going towards the right direction.</p>
<p>The first performance numbers were encouraging, but not as much better than those before the re-labeling effort as I hoped they’d be. In order to investigate the reason for the modest improvement, I’ve employed a very useful technique that I had used earlier with a similar goal, a technique that I call “self-testing” and which I think I heard about in the machine learning literature.</p>
<p>Ideally, when you test your trained model on the same data that you have used to train it, you should see no errors in the model’s predictions. It’s kinda like asking a student to repeat the pages of the schoolbook she has just studied. In reality, however, the model does make some errors, exactly like the human student does <img src='http://s2.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> , and these errors can be attributed to one of two different causes:</p>
<ol>
<li><em>Noisy Training</em>: the training data is not consistent because of some mistakes that took place during the manual labeling, and the learning algorithm is confused by these mistakes. Think of a student being told on Monday that 2 + 2 is 4, and then on Tuesday that 2 + 2 is 5. In my case, it could be that “China” has been tagged as a <em>GeoLocation</em> in one training sentence and erroneously as a <em>Person</em> – or not tagged at all – in another sentence.</li>
<li><em>Limited Learning Capability</em>: the model is unable to learn from the training data due to limitations inherent to its design. Think of a primary school student being told that she can integrate Schrödinger’s wave function to get the probability that a particle is at X, Y, Z and has momentum m. In my case, “Capitol Hill” might have been labeled as a <em>GeoLocation</em> in “The teacher lives on Capitol Hill” and as an <em>Organization</em> in “Last week Capitol Hill passed the bill”, and the model might not be considering enough context (prefix and suffix) in order to be able to discern these two different meanings of “Capitol Hill”.</li>
</ol>
<p>When I originally ran the self-test before Christmas – on the 2,900 sentences re-labeled so far – the model came back with hundreds of errors. I spent most of the holidays’ development time analyzing the errors and improving the model, with results that grew more encouraging by the day. This chart shows the daily changes in the average F score of the model when trained with 90% of the re-labeled data and tested on the remaining 10%:</p>
<p><a href="http://gabrielegiuseppini.files.wordpress.com/2010/01/imprvchartjan242010.jpg"><img title="Model Improvements during 09/10 Holidays" src="http://gabrielegiuseppini.files.wordpress.com/2010/01/imprvchartjan242010.jpg?w=481&#038;h=253" alt="Model Improvements during 09/10 Holidays" width="481" height="253" /></a></p>
<p>The improvements shown in the graph are due to a combination of interventions.</p>
<p>First of all, the self-test pointed me to a number of labeling errors, which I promptly fixed. When the training data became error-free, I attacked the problem of dealing with tokens containing numbers (e.g. “340”), which I never had reason to worry about before, ending up with a huge improvement in the performance of the newly-introduced <em>Currency</em> entity.</p>
<p>Finally, my “secret” recipe kicked in. Leveraging the flexible configurability of the model and fine-tuning it based on analyses of the errors allowed me to obtain the improvements shown with the <em>Person</em>, <em>GeoLocation</em>, and <em>Organization</em> entities. This novel technique I’m using exploits the internal structure of certain entities and takes advantage of the fact that these different entities all “live” in the same model. As an example, consider <em>Organization</em> entities like “Bank of Japan” and “Bank of England”. If the model is capable of understanding that the third token of these entities is always a <em>GeoLocation</em>, then it will be more inclined to flag “Bank of Italy” as an <em>Organization</em> when it sees it for the first time, provided that its vocabulary of <em>GeoLocation</em> literals is comprehensive enough to flag Italy as a <em>GeoLocation</em>. Similarly, the model has been trained to discern, for example, between one-word <em>GeoLocation</em> entities (like “China” and “Italy”) and two-word <em>GeoLocation</em> entities (like “South Korea” and “Northern Ireland”). By being able to make this distinction, the model will be less inclined to flag “Northern” alone as a <em>GeoLocation</em> – which is exactly what used to happen before my intervention. As one would expect, the number of states has exploded from about 300 to 1,280 after all the fine-tuning, but thanks to Freitag and McCallum’s interpolation of emission probabilities, I haven’t experienced any penalty from the fragmentation of states.</p>
<p>All of this contributed to an overall improvement of the F score from 0.795 to 0.869, with <em>Organization</em> entities alone improving to 0.824, well above the best score of 0.755 that I was capable of obtaining last October before the re-labeling effort and before the finalization of my “secret recipe”. Moreover, the current performance of <em>Person</em> entities is exactly as it was when the old model was trained with 2,000 more training sentences.</p>
<p>At this moment all I need to do is complete the re-labeling – about 2,000 sentences left – and hope that the new training data raises the F towards my <a href="http://gabrielegiuseppini.wordpress.com/2008/12/18/my-hmm-goal/">goal</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/322/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/322/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/322/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=322&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2010/01/24/improving-the-model/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>

		<media:content url="http://gabrielegiuseppini.files.wordpress.com/2010/01/imprvchartjan242010.jpg" medium="image">
			<media:title type="html">Model Improvements during 09/10 Holidays</media:title>
		</media:content>
	</item>
		<item>
		<title>Google, Synonyms, and Coca-Cola</title>
		<link>http://gabrielegiuseppini.wordpress.com/2009/12/09/google-synonyms-and-coca-cola/</link>
		<comments>http://gabrielegiuseppini.wordpress.com/2009/12/09/google-synonyms-and-coca-cola/#comments</comments>
		<pubDate>Wed, 09 Dec 2009 12:52:00 +0000</pubDate>
		<dc:creator>Gabriele Giuseppini</dc:creator>
				<category><![CDATA[Text Analytics]]></category>

		<guid isPermaLink="false">http://gabrielegiuseppini.wordpress.com/?p=307</guid>
		<description><![CDATA[A few days ago I was googling for &#8220;security CCE-263&#8221; (I was looking for MITRE stuff) and I got back results that showed &#8220;Coca-Cola&#8221; in bold, as if I had searched for that term. Weird, I thought. I soon realized that &#8220;security&#8221; had nothing to do with it, and so I searched for sugar CCE-foo, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=307&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>A few days ago I was googling for &#8220;security CCE-263&#8221; (I was looking for MITRE stuff) and I got back results that showed &#8220;Coca-Cola&#8221; in bold, as if I had searched for that term. Weird, I thought. I soon realized that &#8220;security&#8221; had nothing to do with it, and so I searched for <a href="http://www.google.com/search?hl=en&amp;q=sugar+CCE-foo&amp;aq=f&amp;oq=&amp;aqi=" target="_blank">sugar CCE-foo</a>, getting results like this:</p>
<blockquote>
<h3><span style="color:blue;text-decoration:underline;">Free <b>coca cola sugar</b> packet Download</span></h3>
<p>Free <b>coca cola sugar</b> packet Download at WareSeeker.com &#8211; Colasoft Packet Player <b>&#8230;</b> <b>foo</b> packet decoder ac3 is a lightweight and useful add-on for foobar2000 <b>&#8230;</b></p>
<h3><span style="color:blue;text-decoration:underline;">Why have i stopped drinking <b>coca cola</b>? « <b>Foo&#39;s</b> blog</span></h3>
<p>11 Oct 2009 <b>&#8230;</b> Firstly it was never good for the health, it has a ton of <b>sugar</b> and caffeine. <b>&#8230;</b> Companies like <b>Coca cola</b>  do not operate democratically, <b>&#8230;</b></p>
<h3><span style="color:blue;text-decoration:underline;">Jones Soda &#8211; Wikipedia, the free encyclopedia</span></h3>
<p>By April 2007, all of the company&#39;s products switched to cane <b>sugar</b>, <b>&#8230;..</b> The Seahawks previously sold soft drinks from The <b>Coca-Cola</b> Company; <b>&#8230;</b>
</p></blockquote>
<p>After a few seconds I realized that the actual name of the Coca-Cola company is <i>&#8220;Coca-Cola Enterprises&#8221;</i>, or <i>&#8220;CCE&#8221;</i> for short. So it appears that Google is seeing &#8220;CCE&#8221; in my query and searching for &#8220;CCE&#8221; and &#8220;Coca-Cola&#8221; at the same time. Now, my question is: is Google doing this for *everything*, or is it only doing it for named entities (i.e. persons, organizations, geographic locations, etc.)? Moreover, is it doing this with acronyms only or also with generic highly-correlated words?</p>
<p>To answer the first question, I searched for other organization acronyms that I thought would be pretty common, checking the results to see if I could see the full name of the entity returned as a keyword, i.e. in bold. I tried with &#8220;SEC&#8221;, &#8220;FBI&#8221;, &#8220;CIA&#8221;, and &#8220;EPA&#8221;, and in no case did I get back the full name of the entity as a keyword. Check it out &#8211; compare <a href="http://www.google.com/search?q=EPA+offices&amp;hl=en&amp;sa=2" target="_blank">&#8220;EPA offices&#8221;</a> with <a href="http://www.google.com/search?q=CCE+offices&amp;hl=en&amp;sa=2" target="_blank">&#8220;CCE offices&#8221;</a> and see how &#8220;Coca-Cola&#8221; is the only full company name that is returned as a keyword.</p>
<p>To answer the last question, I tried to search for &#8220;event viewer microsoft&#8221; in the hope that, since <i>&#8220;microsoft&#8221;</i> and <i>&#8220;windows&#8221;</i> are probably highly correlated, Google would return entries containing <i>&#8220;windows&#8221;</i> in lieu of <i>&#8220;microsoft&#8221;</i>; this is not the case though (as one would expect!), as the <a href="http://www.google.com/search?q=event+viewer+microsoft&amp;hl=en&amp;sa=2" target="_blank">search</a> does not return <i>&#8220;windows&#8221;</i> keywords. Moreover, searching for <a href="http://www.google.com/search?q=msft+redmond&amp;hl=en&amp;sa=2" target="_blank">&#8220;msft redmond&#8221;</a> does not return <i>&#8220;microsoft&#8221;</i> keywords, suggesting that the link between &#8220;Coca-Cola&#8221; and &#8220;CCE&#8221; is not simply based on high correlation of the occurrences of the two words, nor on acronyms.</p>
<p>So, I&#8217;m now left with one possibility only: something&#8217;s going on between Google and Coca-Cola <img src='http://s2.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/gabrielegiuseppini.wordpress.com/307/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/gabrielegiuseppini.wordpress.com/307/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/gabrielegiuseppini.wordpress.com/307/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=gabrielegiuseppini.wordpress.com&amp;blog=3819630&amp;post=307&amp;subd=gabrielegiuseppini&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://gabrielegiuseppini.wordpress.com/2009/12/09/google-synonyms-and-coca-cola/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/e8c21674d4ec19c1f1cb21410c13c293?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">gabrielegiuseppini</media:title>
		</media:content>
	</item>
	</channel>
</rss>
