<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Allen Day's Blog &#187; Analytics</title>
	<atom:link href="http://www.spicylogic.com/allenday/blog/category/science/analytics/feed/" rel="self" type="application/rss+xml" />
	<link>http://www.spicylogic.com/allenday/blog</link>
	<description>♥data♥</description>
	<lastBuildDate>Mon, 21 Jun 2010 23:28:18 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.9.1</generator>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>Taste item-item recommender example</title>
		<link>http://www.spicylogic.com/allenday/blog/2009/02/11/taste-item-item-recommender-example/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2009/02/11/taste-item-item-recommender-example/#comments</comments>
		<pubDate>Wed, 11 Feb 2009 22:10:00 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Java]]></category>
		<category><![CDATA[Mahout]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2009/02/11/taste-item-item-recommender-example/</guid>
		<description><![CDATA[I threw together a Mahout/Taste-based item-item recommender last night.

	public static void itemItemRecommendations&#40;String path, String file&#41; &#123;
		File f = new File&#40;path, file&#41;;
	    try &#123;
			DataModel model = new FileDataModel&#40;f&#41;;
			model.refresh&#40;null&#41;;
		    ItemSimilarity itemSimilarity = new LogLikelihoodSimilarity&#40;model&#41;;
		    ItemBasedRecommender itemRecommender = new GenericItemBasedRecommender&#40;model, itemSimilarity&#41;;
		    for &#40; Item [...]]]></description>
			<content:encoded><![CDATA[<p>I threw together a Mahout/Taste-based item-item recommender last night.</p>

<div class="wp_syntax"><div class="code"><pre class="java">	<span style="color: #000000; font-weight: bold;">public</span> <span style="color: #000000; font-weight: bold;">static</span> <span style="color: #993333;">void</span> itemItemRecommendations<span style="color: #66cc66;">&#40;</span><span style="color: #aaaadd; font-weight: bold;">String</span> path, <span style="color: #aaaadd; font-weight: bold;">String</span> file<span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
		<span style="color: #aaaadd; font-weight: bold;">File</span> f = <span style="color: #000000; font-weight: bold;">new</span> <span style="color: #aaaadd; font-weight: bold;">File</span><span style="color: #66cc66;">&#40;</span>path, file<span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
	    <span style="color: #000000; font-weight: bold;">try</span> <span style="color: #66cc66;">&#123;</span>
			DataModel model = <span style="color: #000000; font-weight: bold;">new</span> FileDataModel<span style="color: #66cc66;">&#40;</span>f<span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
			model.<span style="color: #006600;">refresh</span><span style="color: #66cc66;">&#40;</span><span style="color: #000000; font-weight: bold;">null</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		    ItemSimilarity itemSimilarity = <span style="color: #000000; font-weight: bold;">new</span> LogLikelihoodSimilarity<span style="color: #66cc66;">&#40;</span>model<span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		    ItemBasedRecommender itemRecommender = <span style="color: #000000; font-weight: bold;">new</span> GenericItemBasedRecommender<span style="color: #66cc66;">&#40;</span>model, itemSimilarity<span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		    <span style="color: #b1b100;">for</span> <span style="color: #66cc66;">&#40;</span> Item i : model.<span style="color: #006600;">getItems</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span>
			    <span style="color: #b1b100;">for</span> <span style="color: #66cc66;">&#40;</span> RecommendedItem j : itemRecommender.<span style="color: #006600;">mostSimilarItems</span><span style="color: #66cc66;">&#40;</span>i.<span style="color: #006600;">getID</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>, <span style="color: #cc66cc;">50</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span>
			    	<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> j.<span style="color: #006600;">getValue</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&gt;</span>= <span style="color: #cc66cc;">0.7</span> <span style="color: #66cc66;">&#41;</span>
			    		<span style="color: #aaaadd; font-weight: bold;">System</span>.<span style="color: #006600;">out</span>.<span style="color: #006600;">println</span><span style="color: #66cc66;">&#40;</span>i.<span style="color: #006600;">getID</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> + <span style="color: #ff0000;">&quot;<span style="color: #000099; font-weight: bold;">\t</span>&quot;</span> + j.<span style="color: #006600;">getItem</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>.<span style="color: #006600;">getID</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> + <span style="color: #ff0000;">&quot;<span style="color: #000099; font-weight: bold;">\t</span>&quot;</span> + <span style="color: #aaaadd; font-weight: bold;">String</span>.<span style="color: #006600;">format</span><span style="color: #66cc66;">&#40;</span><span style="color: #ff0000;">&quot;%.3f&quot;</span>, j.<span style="color: #006600;">getValue</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		<span style="color: #66cc66;">&#125;</span> <span style="color: #000000; font-weight: bold;">catch</span> <span style="color: #66cc66;">&#40;</span><span style="color: #aaaadd; font-weight: bold;">FileNotFoundException</span> e<span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
			<span style="color: #808080; font-style: italic;">// TODO Auto-generated catch block</span>
			e.<span style="color: #006600;">printStackTrace</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		<span style="color: #66cc66;">&#125;</span> <span style="color: #000000; font-weight: bold;">catch</span> <span style="color: #66cc66;">&#40;</span>TasteException e<span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
			<span style="color: #808080; font-style: italic;">// TODO Auto-generated catch block</span>
			e.<span style="color: #006600;">printStackTrace</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">;</span>
		<span style="color: #66cc66;">&#125;</span>
	<span style="color: #66cc66;">&#125;</span></pre></div></div>

<p>This outputs item1 &#8211;recommends&#8211;>item2 pairs with a weight.  I&#8217;m taking this and putting it into a solr document so I can display related item2s alongside item1 when it&#8217;s viewed.</p>
<p>Input data are comma-delimited &lt;userID,itemID,score&gt; tuples like so:</p>
<pre>
1fe7401b81eed49353d0cbeba5383848,5212,0.6
3c1832954a6e8781836fed670bb37b24,5212,1
70273e4c7c77700ee97acb8d0306c405,5213,0.8
1f057ccde135acbc881008bbf466e7e1,5213,1
51d44c7baca65ad39d11ba87bf2d438b,5213,1
adc924559b37114cd97d1f5cf7c71419,5213,1
78e254b4a11e61d76ff63cea02de4de8,5213,1
5c373ec7d9ad4a6f392c291d8ccba5ce,5213,0.2
fab8537564094fa8885f6214e6b682e1,5213,1
127f46aabcdbc2d2d04da8398a996c75,5213,1
</pre>
<p>Works great.  Thanks Sean.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2009/02/11/taste-item-item-recommender-example/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		</item>
		<item>
		<title>Upcoming AI / Machine Learning Conferences</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/12/05/upcoming-ai-machine-learning-conferences/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/12/05/upcoming-ai-machine-learning-conferences/#comments</comments>
		<pubDate>Fri, 05 Dec 2008 19:49:13 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Informatics]]></category>
		<category><![CDATA[Mathematics]]></category>
		<category><![CDATA[Networking]]></category>
		<category><![CDATA[Science]]></category>
		<category><![CDATA[Software]]></category>
		<category><![CDATA[Statistics]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/12/05/upcoming-ai-machine-learning-conferences/</guid>
		<description><![CDATA[A (partial) list I found today.  Doesn&#8217;t include NIPS, so I&#8217;m not sure how exhaustive it is, but it has a bunch I haven&#8217;t seen before.
http://www.kmining.com/info_conferences.html
]]></description>
			<content:encoded><![CDATA[<p>A (partial) list I found today.  Doesn&#8217;t include NIPS, so I&#8217;m not sure how exhaustive it is, but it has a bunch I haven&#8217;t seen before.</p>
<p><a href="http://www.kmining.com/info_conferences.html">http://www.kmining.com/info_conferences.html</a></p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/12/05/upcoming-ai-machine-learning-conferences/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>ZIP code demographic data with Perl</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/10/29/zip-code-demographic-data-with-perl/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/10/29/zip-code-demographic-data-with-perl/#comments</comments>
		<pubDate>Wed, 29 Oct 2008 07:45:04 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Perl]]></category>
		<category><![CDATA[Science]]></category>
		<category><![CDATA[Statistics]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/10/29/zip-code-demographic-data-with-perl/</guid>
		<description><![CDATA[I needed some demographics data earlier this week and tried using the SF3 files from census.gov&#8217;s &#8220;Census 2000&#8221; data set.
What a time sink.  Ugh.
The methods used are very well documented, and I learned a lot about the census.  What I was not able to learn, however, was how to actually extract the data [...]]]></description>
			<content:encoded><![CDATA[<p>I needed some demographics data earlier this week and tried using the SF3 files from <a href="http://census.gov">census.gov</a>&#8217;s &#8220;Census 2000&#8221; data set.</p>
<p>What a time sink.  Ugh.</p>
<p>The methods used are very well documented, and I learned a lot about the census.  What I was not able to learn, however, was how to <i>actually extract the data</i> from the flat files.  Look at what <a href="http://www.rdfabout.com/demo/census/">Joshua Tauberer</a> went through to get some idea of the pain level.</p>
<p>Finally I got fed up and wrote a screen scraper for <a href="http://zipskinny.com">ZIPskinny.com</a> in Perl.  It&#8217;s one-off crappy code.  You can get it from CPAN under namespace <a href="http://search.cpan.org/~allenday/Geo-Demo-Zipskinny-0.01/">Geo::Demo::Zipskinny</a>.</p>
<p>Hope it saves you some time.  Leave me a comment if you have <i>working code</i> that can deal with SF3 files.</p>
<p>Here&#8217;s a little ZIP code to rich-vs-poor plot I made earlier.<br />
<img src='http://img362.imageshack.us/img362/9338/incomeeu3.png' alt='' width='600' class='alignnone' /></p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/10/29/zip-code-demographic-data-with-perl/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Webserver logs access time by region/language</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/10/22/webserver-logs-access-time-by-regionlanguage/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/10/22/webserver-logs-access-time-by-regionlanguage/#comments</comments>
		<pubDate>Thu, 23 Oct 2008 02:09:39 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[Science]]></category>
		<category><![CDATA[Statistics]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/10/22/webserver-logs-access-time-by-regionlanguage/</guid>
		<description><![CDATA[As anyone with a popular website knows, there&#8217;s a big difference in the resources required for peak vs. off-peak hours and you typically have to pay for peak usage even if you don&#8217;t always use it (e.g. 95th percentile bandwidth billing).
Frugal as I am, I was curious to see if I could increase traffic during [...]]]></description>
			<content:encoded><![CDATA[<p>As anyone with a popular website knows, there&#8217;s a big difference in the resources required for peak vs. off-peak hours and you typically have to pay for peak usage even if you don&#8217;t always use it (e.g. 95th percentile bandwidth billing).</p>
<p>Frugal as I am, I was curious to see if I could increase traffic during what are off-peak hours.  Seemed sensible that people in different regions of the world might be accessing during off-hours.</p>
<p>So I aggregated data by country code/language and 10-minute time segment.  Applied a Daniell smoothing kernel (a sliding window) of 6 segments (1 hour) and plotted a row-scaled heatmap in R.  Rows are clustered so similar access patterns are next to one another, with the left-hand-side dendrogram indicating dissimilarity between rows.  Yellow-white is a traffic burst.  I&#8217;ll post the code and data later for how I made this.</p>
<p><a href="http://img80.imageshack.us/img80/6247/l10naccesslb0.png"><img width="300" src='http://img80.imageshack.us/img80/6247/l10naccesslb0.png' alt='access times by country/language' class='alignnone' /></a></p>
<p>As it turns out, the main off-peak trough corresponds to the middle of the Pacific ocean.  Kinda watery for people to live there.  Oh well, I tried.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/10/22/webserver-logs-access-time-by-regionlanguage/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>aggregate &#8211; report event counts from a stream</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/10/13/aggregate-report-event-counts-from-a-stream/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/10/13/aggregate-report-event-counts-from-a-stream/#comments</comments>
		<pubDate>Mon, 13 Oct 2008 23:05:06 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Administration]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Perl]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/10/13/aggregate-report-event-counts-from-a-stream/</guid>
		<description><![CDATA[Another shell utility.  This one is useful for, e.g. counting 404, 500, 200, 302 HTTP codes from a log file.

#!/usr/bin/perl
$&#124;++;
use strict;
use Getopt::Long;
&#160;
my $mode = 'line';
my $tick = 100;
my $help = undef;
my $keysfile = undef;
my %keys = &#40;&#41;;
&#160;
GetOptions&#40;
  'mode&#124;m=s' =&#62; \$mode,
  'tick&#124;t=i' =&#62; \$tick,
  'help&#124;h'   =&#62; \$help,
  'keys&#124;k=f' =&#62; [...]]]></description>
			<content:encoded><![CDATA[<p>Another shell utility.  This one is useful for, e.g. counting 404, 500, 200, 302 HTTP codes from a log file.</p>

<div class="wp_syntax"><div class="code"><pre class="perl"><span style="color: #808080; font-style: italic;">#!/usr/bin/perl</span>
$<span style="color: #66cc66;">|</span>++;
<span style="color: #000000; font-weight: bold;">use</span> strict;
<span style="color: #000000; font-weight: bold;">use</span> Getopt::<span style="color: #006600;">Long</span>;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$mode</span> = <span style="color: #ff0000;">'line'</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$tick</span> = <span style="color: #cc66cc;">100</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$help</span> = <span style="color: #000066;">undef</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$keysfile</span> = <span style="color: #000066;">undef</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">%keys</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
GetOptions<span style="color: #66cc66;">&#40;</span>
  <span style="color: #ff0000;">'mode|m=s'</span> =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$mode</span>,
  <span style="color: #ff0000;">'tick|t=i'</span> =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$tick</span>,
  <span style="color: #ff0000;">'help|h'</span>   =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$help</span>,
  <span style="color: #ff0000;">'keys|k=f'</span> =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$keysfile</span>,
<span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$help</span> <span style="color: #66cc66;">||</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mode</span> <span style="color: #b1b100;">ne</span> <span style="color: #ff0000;">'line'</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$mode</span> <span style="color: #b1b100;">ne</span> <span style="color: #ff0000;">'time'</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span> <span style="color: #0000ff;">$tick</span> <span style="color: #66cc66;">&lt;</span>= <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">||</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">defined</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$keysfile</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #66cc66;">!</span>-f <span style="color: #0000ff;">$keysfile</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$USAGE</span> = <span style="color: #000066;">join</span> <span style="color: #ff0000;">''</span>, <span style="color: #009999;">&lt;DATA&gt;</span>;
  <span style="color: #000066;">print</span> <span style="color: #000000; font-weight: bold;">STDERR</span> <span style="color: #0000ff;">$USAGE</span> <span style="color: #b1b100;">and</span> <span style="color: #000066;">exit</span><span style="color: #66cc66;">&#40;</span><span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$keysfile</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">open</span><span style="color: #66cc66;">&#40;</span>K, <span style="color: #0000ff;">$keysfile</span><span style="color: #66cc66;">&#41;</span> <span style="color: #b1b100;">or</span> <span style="color: #000066;">die</span> <span style="color: #ff0000;">&quot;Couldn't open keys file '$keysfile': $!&quot;</span>;
  <span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$line</span> = <span style="color: #009999;">&lt;K&gt;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">chomp</span> <span style="color: #0000ff;">$line</span>;
    <span style="color: #0000ff;">$keys</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$line</span> <span style="color: #66cc66;">&#125;</span>++;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #000066;">close</span><span style="color: #66cc66;">&#40;</span>K<span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">%count</span> = <span style="color: #0000ff;">%keys</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$offset</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$mark</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$offset</span> = <span style="color: #cc66cc;">0</span>;
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mode</span> eq <span style="color: #ff0000;">'time'</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #0000ff;">$mark</span> = <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$element</span> = <span style="color: #66cc66;">&lt;&gt;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">chomp</span> <span style="color: #0000ff;">$element</span>;
  <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">%keys</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #0000ff;">$count</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$element</span> <span style="color: #66cc66;">&#125;</span>++ <span style="color: #b1b100;">if</span> <span style="color: #0000ff;">$keys</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$element</span> <span style="color: #66cc66;">&#125;</span>;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #b1b100;">else</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #0000ff;">$count</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$element</span> <span style="color: #66cc66;">&#125;</span>++;
  <span style="color: #66cc66;">&#125;</span>
&nbsp;
  <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mode</span> eq <span style="color: #ff0000;">'line'</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #0000ff;">$offset</span>++;
    <span style="color: #0000ff;">$mark</span>++;
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mark</span> <span style="color: #66cc66;">&gt;</span>= <span style="color: #0000ff;">$tick</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #0000ff;">$mark</span> = <span style="color: #cc66cc;">0</span>;
      flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #b1b100;">elsif</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mode</span> eq <span style="color: #ff0000;">'time'</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$mark</span> + <span style="color: #0000ff;">$tick</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #0000ff;">$offset</span> = <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
      <span style="color: #0000ff;">$mark</span> = <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
      flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
<span style="color: #66cc66;">&#125;</span>
flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #000000; font-weight: bold;">sub</span> flush <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">print</span> <span style="color: #ff0000;">&quot;summary/$tick @ $offset<span style="color: #000099; font-weight: bold;">\n</span>&quot;</span>;
  <span style="color: #b1b100;">foreach</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$k</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">sort</span> <span style="color: #000066;">keys</span> <span style="color: #0000ff;">%count</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">print</span> <span style="color: #ff0000;">&quot;<span style="color: #000099; font-weight: bold;">\t</span>&quot;</span>, <span style="color: #0000ff;">$count</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$k</span> <span style="color: #66cc66;">&#125;</span>, <span style="color: #ff0000;">&quot;<span style="color: #000099; font-weight: bold;">\t</span>&quot;</span>, <span style="color: #0000ff;">$k</span>, <span style="color: #ff0000;">&quot;<span style="color: #000099; font-weight: bold;">\n</span>&quot;</span>;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #0000ff;">%count</span> = <span style="color: #0000ff;">%keys</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #000000; font-weight: bold;">__DATA__</span>
Usage: aggregate <span style="color: #66cc66;">&#91;</span>-h<span style="color: #66cc66;">&#93;</span> <span style="color: #66cc66;">&#91;</span>-<span style="color: #000066;">m</span> <span style="color: #66cc66;">&lt;</span>time<span style="color: #66cc66;">|</span>line<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#93;</span> <span style="color: #66cc66;">&#91;</span>-t <span style="color: #66cc66;">&lt;</span><span style="color: #808080; font-style: italic;"># of seconds or lines&gt;] [-k &lt;keys file&gt;]</span>
&nbsp;
Read lines from <span style="color: #000000; font-weight: bold;">STDIN</span>.  Print lines by frequency per input lines <span style="color: #b1b100;">or</span> <span style="color: #000066;">time</span>.
&nbsp;
  -h    show help <span style="color: #66cc66;">&#40;</span>this message<span style="color: #66cc66;">&#41;</span>
  -<span style="color: #000066;">m</span>    mode.  one of <span style="color: #ff0000;">'time'</span> <span style="color: #b1b100;">or</span> <span style="color: #ff0000;">'line'</span>.  defaults to <span style="color: #ff0000;">'line'</span>.
  -t    aggregation size.  an integer.  value is <span style="color: #808080; font-style: italic;"># of lines ('line' mode) or # of</span>
        seconds <span style="color: #66cc66;">&#40;</span><span style="color: #ff0000;">'time'</span> mode<span style="color: #66cc66;">&#41;</span> after which an aggregation is triggered.  defaults to <span style="color: #cc66cc;">100</span>.
  -k    <span style="color: #000066;">keys</span> file.  a text file of strings to <span style="color: #66cc66;">*</span>exactly<span style="color: #66cc66;">*</span> match in the input, one per line.
        <span style="color: #b1b100;">if</span> a <span style="color: #000066;">keys</span> file is provided, lines <span style="color: #b1b100;">not</span> present in the <span style="color: #000066;">keys</span> file will be silently
        ignored.</pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/10/13/aggregate-report-event-counts-from-a-stream/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>shuffle &#8211; randomize a stream of data</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/10/10/shuffle-randomize-a-stream-of-data/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/10/10/shuffle-randomize-a-stream-of-data/#comments</comments>
		<pubDate>Fri, 10 Oct 2008 18:20:47 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Administration]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Perl]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/10/10/shuffle-randomize-a-stream-of-data/</guid>
		<description><![CDATA[Here&#8217;s another little shell utility I&#8217;ve been sitting on for a while.  This one shuffles the line-oriented data read from a pipe.  It has the notion of buffering and partial flushing so we can handle streams / very large data sets.

#!/usr/bin/perl
$&#124;++;
use strict;
use Getopt::Long;
&#160;
my $USAGE = join '', &#60;DATA&#62;;
&#160;
my $B = 0;
my $D = [...]]]></description>
			<content:encoded><![CDATA[<p>Here&#8217;s another little shell utility I&#8217;ve been sitting on for a while.  This one shuffles the line-oriented data read from a pipe.  It has the notion of buffering and partial flushing so we can handle streams / very large data sets.</p>

<div class="wp_syntax"><div class="code"><pre class="perl"><span style="color: #808080; font-style: italic;">#!/usr/bin/perl</span>
$<span style="color: #66cc66;">|</span>++;
<span style="color: #000000; font-weight: bold;">use</span> strict;
<span style="color: #000000; font-weight: bold;">use</span> Getopt::<span style="color: #006600;">Long</span>;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$USAGE</span> = <span style="color: #000066;">join</span> <span style="color: #ff0000;">''</span>, <span style="color: #009999;">&lt;DATA&gt;</span>;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$B</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$D</span> = <span style="color: #cc66cc;">1</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$H</span> = <span style="color: #cc66cc;">0</span>;
&nbsp;
GetOptions <span style="color: #66cc66;">&#40;</span><span style="color: #ff0000;">&quot;buffer|b=i&quot;</span>   =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$B</span>,
            <span style="color: #ff0000;">&quot;draw|d=i&quot;</span>     =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$D</span>,
            <span style="color: #ff0000;">&quot;help|h&quot;</span>       =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$H</span>,
           <span style="color: #66cc66;">&#41;</span>; 
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$D</span> == <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$B</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #0000ff;">$D</span> = <span style="color: #0000ff;">$B</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$B</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$D</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$B</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$D</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$H</span><span style="color: #66cc66;">&#41;</span>
<span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">print</span> <span style="color: #0000ff;">$USAGE</span> <span style="color: #b1b100;">and</span> <span style="color: #000066;">exit</span><span style="color: #66cc66;">&#40;</span><span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">@buf</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$element</span> = <span style="color: #66cc66;">&lt;&gt;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #808080; font-style: italic;">#buffer whole stream</span>
  <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$B</span> == <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">push</span> <span style="color: #0000ff;">@buf</span>, <span style="color: #0000ff;">$element</span>;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #808080; font-style: italic;">#no-op</span>
  <span style="color: #b1b100;">elsif</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$B</span> == <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">print</span> <span style="color: #0000ff;">$element</span>;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #808080; font-style: italic;">#buffer window</span>
  <span style="color: #b1b100;">else</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">push</span> <span style="color: #0000ff;">@buf</span>, <span style="color: #0000ff;">$element</span>;
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&gt;</span>= <span style="color: #0000ff;">$D</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$B</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
<span style="color: #66cc66;">&#125;</span>
flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #000000; font-weight: bold;">sub</span> flush <span style="color: #66cc66;">&#123;</span>
  <span style="color: #b1b100;">for</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$j</span> = <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> - <span style="color: #cc66cc;">1</span> ; <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&gt;</span>= <span style="color: #cc66cc;">0</span> ; <span style="color: #0000ff;">$j</span>-- <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$swap</span> = <span style="color: #000066;">int</span><span style="color: #66cc66;">&#40;</span><span style="color: #000066;">rand</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$j</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">!</span>= <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#93;</span>, <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#41;</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">&#93;</span>, <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> - <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$B</span> - <span style="color: #0000ff;">$D</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">print</span> <span style="color: #000066;">shift</span> <span style="color: #0000ff;">@buf</span>;
  <span style="color: #66cc66;">&#125;</span>
<span style="color: #66cc66;">&#125;</span>
&nbsp;
&nbsp;
<span style="color: #000000; font-weight: bold;">__DATA__</span>
Usage: shuffle <span style="color: #66cc66;">&#91;</span>-h<span style="color: #66cc66;">&#93;</span> <span style="color: #66cc66;">&#91;</span>-b <span style="color: #66cc66;">&lt;</span>buffer size<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#93;</span> <span style="color: #66cc66;">&#91;</span>-d <span style="color: #66cc66;">&lt;</span>draw size<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#93;</span>
&nbsp;
Shuffle lines from a stream on <span style="color: #000000; font-weight: bold;">STDIN</span>.  Write lines to <span style="color: #000000; font-weight: bold;">STDOUT</span>.
&nbsp;
  -h    show help <span style="color: #66cc66;">&#40;</span>this message<span style="color: #66cc66;">&#41;</span>
  -b    buffer size
        <span style="color: #66cc66;">&#40;</span>default <span style="color: #cc66cc;">0</span>.  indicates shuffle whole stream, <span style="color: #b1b100;">then</span> <span style="color: #000066;">write</span><span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">1</span>..
  -d    draw size
        <span style="color: #66cc66;">&#40;</span>defaults to value of -b.  number of items to remove from the
        buffer when it fills<span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">1</span>..buffer size
&nbsp;
You have two parameters available <span style="color: #66cc66;">&#40;</span>besides -h <span style="color: #b1b100;">for</span> help<span style="color: #66cc66;">&#41;</span>.
&nbsp;
<span style="color: #66cc66;">*</span> buffer size <span style="color: #66cc66;">&#40;</span>-b<span style="color: #66cc66;">&#41;</span>.  Determines how many elements to temporarily hold
before shuffling.  The advantage of this buffer is to allow shuffling on
very long streams that would <span style="color: #b1b100;">not</span> fit into <span style="color: #000066;">system</span> memory.  The
disadvantage is that it is <span style="color: #b1b100;">not</span> a truly random shuffle, as <span style="color: #000066;">each</span> input
element can appear at most buffer-size positions away from the original
position.  Buffer size defaults to zero, so make sure to set it <span style="color: #b1b100;">if</span> your
data set size is large.
&nbsp;
<span style="color: #66cc66;">*</span> draw size <span style="color: #66cc66;">&#40;</span>-d<span style="color: #66cc66;">&#41;</span>.  Determines how frequently the buffer is shuffled <span style="color: #b1b100;">and</span>
flushed.  Rather than shuffling<span style="color: #66cc66;">/</span>flushing all elements in the buffer, only
<span style="color: #b1b100;">do</span> D elements.  The advantage here is elements can appear more than
buffer-size positions away from the original position.  The disadvantage
is that shuffling is done B<span style="color: #66cc66;">/</span>D <span style="color: #000066;">times</span> more frequently.  Draw size defaults
to buffer size, at which setting it has <span style="color: #000066;">no</span> effect.  Set it to <span style="color: #cc66cc;">1</span> to maximize randomness.
&nbsp;
Copyright<span style="color: #66cc66;">/</span>License:
&nbsp;
  Allen Day <span style="color: #66cc66;">&lt;</span>allenday<span style="color: #0000ff;">@ucla</span>.edu<span style="color: #66cc66;">&gt;</span>, licensed under GPL <span style="color: #cc66cc;">2006</span><span style="color: #cc66cc;">-2008</span></pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/10/10/shuffle-randomize-a-stream-of-data/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>sample &#8211; probabilistic sampling from a stream of lines</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/10/09/sample-probabilistic-sampling-from-a-stream-of-lines/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/10/09/sample-probabilistic-sampling-from-a-stream-of-lines/#comments</comments>
		<pubDate>Fri, 10 Oct 2008 01:33:31 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Administration]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Perl]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/10/09/sample-probabilistic-sampling-from-a-stream-of-lines/</guid>
		<description><![CDATA[I&#8217;m frequently monitoring webservers, cache servers, database servers, etc by tailing their log files.  See my previous post on making logs easier to monitor by color.
Sometimes you also have too much data, and you don&#8217;t want to look at all of it.  Use this to sample.
sample source:

#!/usr/bin/perl
$&#124;++;
use strict;
use Getopt::Long;
&#160;
my $USAGE = join '', [...]]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m frequently monitoring webservers, cache servers, database servers, etc by tailing their log files.  See my previous post on <a href="/blog/2008/07/05/pcoc-piped-command-output-colorizer/">making logs easier to monitor by color</a>.</p>
<p>Sometimes you also have too much data, and you don&#8217;t want to look at all of it.  Use this to sample.</p>
<p>sample source:</p>

<div class="wp_syntax"><div class="code"><pre class="perl"><span style="color: #808080; font-style: italic;">#!/usr/bin/perl</span>
$<span style="color: #66cc66;">|</span>++;
<span style="color: #000000; font-weight: bold;">use</span> strict;
<span style="color: #000000; font-weight: bold;">use</span> Getopt::<span style="color: #006600;">Long</span>;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$USAGE</span> = <span style="color: #000066;">join</span> <span style="color: #ff0000;">''</span>, <span style="color: #009999;">&lt;DATA&gt;</span>;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$T</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$K</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$P</span> = <span style="color: #cc66cc;">1</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$H</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$N</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$S</span> = <span style="color: #cc66cc;">0</span>;
&nbsp;
GetOptions <span style="color: #66cc66;">&#40;</span><span style="color: #ff0000;">&quot;time|t=i&quot;</span>     =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$T</span>,
            <span style="color: #ff0000;">&quot;number|n=i&quot;</span>   =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$N</span>,
            <span style="color: #ff0000;">&quot;count|k=i&quot;</span>    =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$K</span>,
            <span style="color: #ff0000;">&quot;prob|p=f&quot;</span>     =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$P</span>,
            <span style="color: #ff0000;">&quot;shuffle|s&quot;</span>    =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$S</span>,
            <span style="color: #ff0000;">&quot;help|h&quot;</span>       =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">$H</span>,
           <span style="color: #66cc66;">&#41;</span>; 
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$T</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">!</span>= <span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$K</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">!</span>= <span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$K</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">||</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">||</span> <span style="color: #0000ff;">$T</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">||</span> <span style="color: #0000ff;">$N</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">||</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$T</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$N</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">||</span>
  <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$H</span><span style="color: #66cc66;">&#41;</span>
<span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">print</span> <span style="color: #0000ff;">$USAGE</span> <span style="color: #b1b100;">and</span> <span style="color: #000066;">exit</span><span style="color: #66cc66;">&#40;</span><span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$position</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">@buf</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$before</span> = <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$element</span> = <span style="color: #66cc66;">&lt;&gt;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #808080; font-style: italic;"># sample full stream, report at the end</span>
  <span style="color: #808080; font-style: italic;"># sample K elements every T seconds</span>
  <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$K</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #0000ff;">$K</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #000066;">push</span> <span style="color: #0000ff;">@buf</span>, <span style="color: #66cc66;">&#91;</span><span style="color: #0000ff;">$position</span>, <span style="color: #0000ff;">$element</span><span style="color: #66cc66;">&#93;</span>;
    <span style="color: #66cc66;">&#125;</span>
    <span style="color: #b1b100;">elsif</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$K</span><span style="color: #66cc66;">/</span><span style="color: #0000ff;">$position</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #000066;">rand</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$index</span> = <span style="color: #000066;">int</span><span style="color: #66cc66;">&#40;</span><span style="color: #000066;">rand</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$K</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#41;</span>;
      <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$index</span> <span style="color: #66cc66;">&#93;</span> = <span style="color: #66cc66;">&#91;</span><span style="color: #0000ff;">$position</span>, <span style="color: #0000ff;">$element</span><span style="color: #66cc66;">&#93;</span>; <span style="color: #808080; font-style: italic;">#save position for sort</span>
    <span style="color: #66cc66;">&#125;</span>
    <span style="color: #808080; font-style: italic;">#time-based K-sampling</span>
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$T</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$before</span> + <span style="color: #0000ff;">$T</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
    <span style="color: #808080; font-style: italic;">#event-based K-sampling</span>
    <span style="color: #b1b100;">elsif</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$N</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #0000ff;">$position</span> <span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$N</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #808080; font-style: italic;"># sample with probability</span>
  <span style="color: #b1b100;">elsif</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&amp;&amp;</span> <span style="color: #000066;">rand</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&lt;</span> <span style="color: #0000ff;">$P</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #000066;">print</span> <span style="color: #0000ff;">$element</span>;
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #0000ff;">$position</span>++;
<span style="color: #66cc66;">&#125;</span>
flush<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #000000; font-weight: bold;">sub</span> flush <span style="color: #66cc66;">&#123;</span>
  <span style="color: #0000ff;">$before</span> = <span style="color: #000066;">time</span><span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
  <span style="color: #808080; font-style: italic;">#Knuth shuffle</span>
  <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$S</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">for</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$j</span> = <span style="color: #000066;">scalar</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> - <span style="color: #cc66cc;">1</span> ; <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&gt;</span>= <span style="color: #cc66cc;">0</span> ; <span style="color: #0000ff;">$j</span>-- <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$swap</span> = <span style="color: #000066;">int</span><span style="color: #66cc66;">&#40;</span><span style="color: #000066;">rand</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$j</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#41;</span>;
      <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">!</span>= <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
        <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#93;</span>, <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#41;</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$swap</span> <span style="color: #66cc66;">&#93;</span>, <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#41;</span>;
      <span style="color: #66cc66;">&#125;</span>
      <span style="color: #000066;">print</span> <span style="color: #0000ff;">$buf</span><span style="color: #66cc66;">&#91;</span> <span style="color: #0000ff;">$j</span> <span style="color: #66cc66;">&#93;</span>-<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#91;</span> <span style="color: #cc66cc;">1</span> <span style="color: #66cc66;">&#93;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #b1b100;">else</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">foreach</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$b</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">sort</span> <span style="color: #66cc66;">&#123;</span><span style="color: #0000ff;">$a</span>-<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#91;</span><span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#93;</span> <span style="color: #66cc66;">&lt;</span>=<span style="color: #66cc66;">&gt;</span> <span style="color: #0000ff;">$b</span>-<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#91;</span><span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#125;</span> <span style="color: #0000ff;">@buf</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #000066;">print</span> <span style="color: #0000ff;">$b</span>-<span style="color: #66cc66;">&gt;</span><span style="color: #66cc66;">&#91;</span><span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#93;</span>;
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #0000ff;">@buf</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
  <span style="color: #0000ff;">$position</span> = <span style="color: #cc66cc;">0</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
&nbsp;
<span style="color: #000000; font-weight: bold;">__DATA__</span>
Usage: sample -<span style="color: #66cc66;">&#91;</span><span style="color: #66cc66;">&#91;</span>h<span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#91;</span>p<span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#91;</span>t<span style="color: #66cc66;">&#91;</span>k<span style="color: #66cc66;">&#91;</span>n<span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#93;</span><span style="color: #66cc66;">&#93;</span>
&nbsp;
Sample lines from a stream on <span style="color: #000000; font-weight: bold;">STDIN</span>.  Write lines to <span style="color: #000000; font-weight: bold;">STDOUT</span>.
&nbsp;
  -h    show help <span style="color: #66cc66;">&#40;</span>this message<span style="color: #66cc66;">&#41;</span>
  -k    sample K elements from stream
        <span style="color: #66cc66;">&#40;</span>default <span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">0</span>..
  -p    sample elements from stream with probability
        <span style="color: #66cc66;">&#40;</span>default <span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">0</span> <span style="color: #66cc66;">&lt;</span>= p <span style="color: #66cc66;">&lt;</span>= <span style="color: #cc66cc;">1</span>
  -n    sample over windows of N elements
        <span style="color: #66cc66;">&#40;</span>default <span style="color: #cc66cc;">0</span><span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">0</span>..
  -t    sample over windows of T seconds
        <span style="color: #66cc66;">&#40;</span>default <span style="color: #cc66cc;">0</span>, instantaneous with -p, infinity with -k<span style="color: #66cc66;">&#41;</span>
        range: <span style="color: #cc66cc;">0</span>..
  -<span style="color: #000066;">s</span>    shuffle outputs
        <span style="color: #66cc66;">&#40;</span>default false<span style="color: #66cc66;">&#41;</span>
&nbsp;
There are two modes of sampling:
&nbsp;
  <span style="color: #66cc66;">*</span> sample with probability <span style="color: #66cc66;">&#40;</span>-p<span style="color: #66cc66;">&#41;</span>
  <span style="color: #66cc66;">*</span> sample a fixed number of elements <span style="color: #66cc66;">&#40;</span>-k<span style="color: #66cc66;">&#41;</span>
&nbsp;
Both modes sample over a given <span style="color: #000066;">time</span> interval in seconds <span style="color: #66cc66;">&#40;</span>-t<span style="color: #66cc66;">&#41;</span>.
-t defaults to zero <span style="color: #66cc66;">&#40;</span>process full stream<span style="color: #66cc66;">&#41;</span>.  -p can only be
used alone.  -n can only be used with -k
&nbsp;
Examples:
&nbsp;
  <span style="color: #66cc66;">*</span> sample K elements from a stream:
    cat <span style="color: #66cc66;">/</span>etc<span style="color: #66cc66;">/</span>passwd <span style="color: #66cc66;">|</span> sample -k <span style="color: #cc66cc;">5</span>
&nbsp;
  <span style="color: #66cc66;">*</span> sample <span style="color: #cc66cc;">1</span><span style="color: #66cc66;">%</span> of elements from a stream:
    tail -f <span style="color: #66cc66;">/</span>var<span style="color: #66cc66;">/</span>logs<span style="color: #66cc66;">/</span>httpd<span style="color: #66cc66;">/</span>access_log <span style="color: #66cc66;">|</span> sample -p <span style="color: #cc66cc;">0.01</span>
&nbsp;
  <span style="color: #66cc66;">*</span> sample K elements from a stream every <span style="color: #cc66cc;">30</span> seconds:
    tail -f <span style="color: #66cc66;">/</span>var<span style="color: #66cc66;">/</span>logs<span style="color: #66cc66;">/</span>httpd<span style="color: #66cc66;">/</span>access_log <span style="color: #66cc66;">|</span> sample -k <span style="color: #cc66cc;">5</span> -t <span style="color: #cc66cc;">30</span>
&nbsp;
  <span style="color: #66cc66;">*</span> sample K elements from a stream every <span style="color: #cc66cc;">30</span> seconds, shuffled:
    tail -f <span style="color: #66cc66;">/</span>var<span style="color: #66cc66;">/</span>logs<span style="color: #66cc66;">/</span>httpd<span style="color: #66cc66;">/</span>access_log <span style="color: #66cc66;">|</span> sample -k <span style="color: #cc66cc;">5</span> -t <span style="color: #cc66cc;">30</span> -<span style="color: #000066;">s</span>
&nbsp;
  <span style="color: #66cc66;">*</span> sample K elements from a stream every <span style="color: #cc66cc;">100</span> elements:
    tail -f <span style="color: #66cc66;">/</span>var<span style="color: #66cc66;">/</span>logs<span style="color: #66cc66;">/</span>httpd<span style="color: #66cc66;">/</span>access_log <span style="color: #66cc66;">|</span> sample -k <span style="color: #cc66cc;">5</span> -n <span style="color: #cc66cc;">100</span>
&nbsp;
Copyright<span style="color: #66cc66;">/</span>License:
&nbsp;
  Allen Day <span style="color: #66cc66;">&lt;</span>allenday<span style="color: #0000ff;">@ucla</span>.edu<span style="color: #66cc66;">&gt;</span>, licensed under GPL <span style="color: #cc66cc;">2006</span><span style="color: #cc66cc;">-2008</span></pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/10/09/sample-probabilistic-sampling-from-a-stream-of-lines/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Examples for data import, export, and transport with HBase</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/08/20/examples-for-data-import-export-and-transport-with-hbase/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/08/20/examples-for-data-import-export-and-transport-with-hbase/#comments</comments>
		<pubDate>Thu, 21 Aug 2008 02:23:20 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Distributed Systems]]></category>
		<category><![CDATA[Java]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/08/20/examples-for-data-import-export-and-transport-with-hbase/</guid>
		<description><![CDATA[I&#8217;m in the process of setting up an analytic workflow at BiggerBoat.  It&#8217;s looking like the main theme in data structures around here will be the sparse matrix.  So I&#8217;ve been playing with opensource technologies for sparse matrices.  Apache Hadoop&#8217;s HBase is looking like a good choice for now, maybe Hive later.
Right [...]]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m in the process of setting up an analytic workflow at <a href="http://biggerboat.com">BiggerBoat</a>.  It&#8217;s looking like the main theme in data structures around here will be the sparse matrix.  So I&#8217;ve been playing with opensource technologies for sparse matrices.  Apache Hadoop&#8217;s <a href="http://hadoop.apache.org/hbase/">HBase</a> is looking like a good choice for now, maybe <a href="http://www.new.facebook.com/note.php?note_id=16121578919">Hive</a> later.</p>
<p>Right now I&#8217;m getting familiar with the former.  As part of this, I&#8217;m improving the docs on the wiki to make them more user- (as opposed to core developer-) friendly.  My documentation goal right now is to add some data transformation example code.  There are already lots of hadoop examples for doing text -&gt; text mapping, e.g. grep, cat, etc.  For HBase not so much.  I.e.</p>
<ul>
<li>text to text (done, many examples)</li>
<li>flatfile to HBase table (Bulk loader in the HBase wiki, I haven&#8217;t tried it yet)</li>
<li>HBase table to flatfile</li>
<li>HBase table to HBase table</li>
</ul>
<p>I&#8217;ll be adding updated, complete, and simple code for the latter two (three?) in the next few days to the <a href="http://wiki.apache.org/hadoop/Hbase/MapReduce">HBase/MapReduce</a> page.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/08/20/examples-for-data-import-export-and-transport-with-hbase/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		</item>
		<item>
		<title>Cookies, IP Addresses and Unique Users</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/08/15/cookies-ip-addresses-and-unique-users/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/08/15/cookies-ip-addresses-and-unique-users/#comments</comments>
		<pubDate>Fri, 15 Aug 2008 21:03:11 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Random musings]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/?p=48</guid>
		<description><![CDATA[I&#8217;ve been thinking about how to track unique users today.  These are my so-far-unorganized thoughts.  Please comment!
You can&#8217;t track users by cookie alone for a couple of reasons: 1) they might use multiple computers, 2) they might delete cookies, 3) multiple users might share the same computer (same account=same cookie)
You also can&#8217;t track [...]]]></description>
			<content:encoded><![CDATA[<p>I&#8217;ve been thinking about how to track unique users today.  These are my so-far-unorganized thoughts.  Please comment!</p>
<p>You can&#8217;t track users by cookie alone for a couple of reasons: 1) they might use multiple computers, 2) they might delete cookies, 3) multiple users might share the same computer (same account=same cookie)</p>
<p>You also can&#8217;t track users by IP address alone for some more reasons: 1) they might be using a mobile device or portable computer that moves from IP to IP, 2) there could be multiple machines passing through a single gateway IP (i.e. LAN NAT).</p>
<p>However, if you combine the cookie/IP information together, you can start to address some of these issues.  Let&#8217;s assume you have some webserver logs that minimally contain <code>&lt;IP address "A"&gt;, &lt;cookie ID "C"&gt;, &lt;timestamp T&gt;</code> triples.</p>
<pre style="padding:2em; background-color:lightgrey">
            === time ===>
PATTERN 1:
C1-A1 ---      ---
C1-A2    ---         ---
C1-A3       ---   ---

PATTERN 2:
C1-A1 ------
C2-A1       ------
C1-A2             ------

PATTERN 3:
C1-A1 -----!
C2-A1       -----!
C3-A1             ------

PATTERN 4:
C1-A1 ------      ------
C2-A1     ----------

PATTERN 5:
C1-A1 -----     -----
C2-A1      -----     ---
</pre>
<p>This matrix indicates the compatibility of each of the patterns (P1-P5)<br />
with several different classes of cookie/IP address combination that we<br />
might want to detect.</p>
<pre style="padding:2em; background-color:lightgrey">
                                     patterns
                               P1  P2  P3  P4  P5
profiles                      --------------------
multiple users per IP        | -   +   -   +   +
multiple users per cookie    | -   -   -   -   -
multiple IPs per user        | +   +   -   -   -
multiple cookies per user    | -   -   +   -   +
cookie deletion              | -   -   +   -   -
"permanent" IP change        | -   +   -   -   -
</pre>
<p>Note that none of these patterns gives any indication for the &#8220;multiple<br />
users per cookie&#8221; profile.  To assess if there is more than one<br />
user/cookie, you might want to look at the context in which you&#8217;re<br />
observing the cookie.  Consider attributes like (timezone corrected)<br />
time-of-day, day-of-week, type of content being viewed.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/08/15/cookies-ip-addresses-and-unique-users/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
		</item>
		<item>
		<title>pcoc &#8211; Piped Command Output Colorizer</title>
		<link>http://www.spicylogic.com/allenday/blog/2008/07/05/pcoc-piped-command-output-colorizer/</link>
		<comments>http://www.spicylogic.com/allenday/blog/2008/07/05/pcoc-piped-command-output-colorizer/#comments</comments>
		<pubDate>Sun, 06 Jul 2008 01:37:09 +0000</pubDate>
		<dc:creator>allenday</dc:creator>
				<category><![CDATA[Administration]]></category>
		<category><![CDATA[Analytics]]></category>
		<category><![CDATA[Perl]]></category>

		<guid isPermaLink="false">http://www.spicylogic.com/allenday/blog/2008/07/05/pcoc-piped-command-output-colorizer/</guid>
		<description><![CDATA[I&#8217;m frequently monitoring webservers, cache servers, database servers, etc by tailing their log files, e.g.

tail -f /etc/httpd/logs/access_log

I like the --color option provided by grep, but found it to be too limited (only one allowed, no wildcard support).  After a bit of searching to see if a tool existed for doing arbitrary colorizing, I found
acoc, [...]]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m frequently monitoring webservers, cache servers, database servers, etc by tailing their log files, e.g.</p>

<div class="wp_syntax"><div class="code"><pre class="bash"><span style="color: #c20cb9; font-weight: bold;">tail</span> -f <span style="color: #000000; font-weight: bold;">/</span>etc<span style="color: #000000; font-weight: bold;">/</span>httpd<span style="color: #000000; font-weight: bold;">/</span>logs<span style="color: #000000; font-weight: bold;">/</span>access_log</pre></div></div>

<p>I like the --color option provided by grep, but found it to be too limited (only one allowed, no wildcard support).  After a bit of searching to see if a tool existed for doing arbitrary colorizing, I found<br />
<a href="http://www.caliban.org/ruby/acoc.shtml">acoc, the Arbitrary Command Output Colourer</a>.</p>
<p>&#8230;which almost did what I needed, but couldn&#8217;t read from a pipe.  So I wrote pcoc, the Piped Command Output Colorizer.  I&#8217;m only publishing this because I&#8217;ve been using it for about 1 1/2 years, and still find it useful.</p>
<p>Source code at the end of this post.  Here&#8217;s an example that highlights iPhone/iPod user agents and requests with a 500/400/404 HTTP response:</p>

<div class="wp_syntax"><div class="code"><pre class="bash"><span style="color: #c20cb9; font-weight: bold;">tail</span> -f .<span style="color: #000000; font-weight: bold;">/</span>logs<span style="color: #000000; font-weight: bold;">/</span>access_log <span style="color: #000000; font-weight: bold;">|</span> pcoc -f <span style="color: #ff0000;">'(iPod)=bold cyan'</span> -f <span style="color: #ff0000;">'(iPhone)=bold magenta'</span> -f <span style="color: #ff0000;">'<span style="color: #000099; font-weight: bold;">\b</span>(500|404|400)<span style="color: #000099; font-weight: bold;">\b</span>=red on_black'</span></pre></div></div>

<p>Sorry, no screenshots <img src='http://www.spicylogic.com/allenday/blog/wp-includes/images/smilies/icon_sad.gif' alt=':(' class='wp-smiley' /> .</p>
<p>pcoc source:</p>

<div class="wp_syntax"><div class="code"><pre class="perl"><span style="color: #808080; font-style: italic;">#!/usr/bin/perl</span>
<span style="color: #000000; font-weight: bold;">use</span> strict;
<span style="color: #000000; font-weight: bold;">use</span> Getopt::<span style="color: #006600;">Long</span>;
<span style="color: #000000; font-weight: bold;">use</span> Term::<span style="color: #006600;">ANSIColor</span> <span style="color: #000066;">qw</span><span style="color: #66cc66;">&#40;</span>colored<span style="color: #66cc66;">&#41;</span>;
$<span style="color: #66cc66;">|</span>++;
&nbsp;
<span style="color: #b1b100;">my</span> <span style="color: #0000ff;">%format</span> = <span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#41;</span>;
GetOptions<span style="color: #66cc66;">&#40;</span> <span style="color: #ff0000;">&quot;format|f=s&quot;</span> =<span style="color: #66cc66;">&gt;</span> \<span style="color: #0000ff;">%format</span><span style="color: #66cc66;">&#41;</span>;
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #66cc66;">!</span> <span style="color: #000066;">keys</span> <span style="color: #0000ff;">%format</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">print</span> <span style="color: #66cc66;">&lt;&lt;</span><span style="color: #ff0000;">&quot;EOF&quot;</span>;
Synopsis:
        pcoc - Piped Command Output Colorizer.  Inspired by acoc.
&nbsp;
Usage:
&nbsp;
        $<span style="color: #cc66cc;">0</span> -f <span style="color: #ff0000;">'&lt;regex1&gt;=&lt;color1&gt;'</span> -f <span style="color: #ff0000;">'&lt;regex2&gt;=&lt;color2&gt;'</span>
&nbsp;
$<span style="color: #cc66cc;">0</span> reads from a <span style="color: #000066;">pipe</span> <span style="color: #b1b100;">and</span> colorizes <span style="color: #000066;">each</span> line based on <span style="color: #000066;">format</span> <span style="color: #66cc66;">&#40;</span>-f<span style="color: #66cc66;">&#41;</span> parameters.
&nbsp;
Arguments:
&nbsp;
-f <span style="color: #ff0000;">'&lt;regex&gt;=&lt;color&gt;'</span>  Required, multiple <span style="color: #000066;">values</span> okay. 
&nbsp;
        <span style="color: #009999;">&lt;regex&gt;</span>: A regular expression from which \$<span style="color: #cc66cc;">1</span> will be colorized
&nbsp;
        <span style="color: #009999;">&lt;color&gt;</span>: One <span style="color: #b1b100;">or</span> more colorization keywords, see perldoc
        Term::<span style="color: #006600;">ANSIColor</span>, but briefly they are:
&nbsp;
        boldness:
                bold
        foreground:
                red yellow green blue magenta cyan black white
        background:
                on_red on_yellow on_green on_blue on_magenta on_cyan
                on_black on_white
&nbsp;
Examples:
&nbsp;
        <span style="color: #808080; font-style: italic;">#highlight the account's shell in bold green</span>
        cat <span style="color: #66cc66;">/</span>etc<span style="color: #66cc66;">/</span>passwd <span style="color: #66cc66;">|</span> $<span style="color: #cc66cc;">0</span> -f <span style="color: #ff0000;">'.+:([^:]+)\$=bold green'</span>
&nbsp;
        <span style="color: #808080; font-style: italic;">#... and the username in red with black background</span>
        cat <span style="color: #66cc66;">/</span>etc<span style="color: #66cc66;">/</span>passwd <span style="color: #66cc66;">|</span> $<span style="color: #cc66cc;">0</span> -f <span style="color: #ff0000;">'([^:]+)=red on_black'</span> -f <span style="color: #ff0000;">'.+:([^:]+)\$=bold green'</span>
&nbsp;
Copyright<span style="color: #66cc66;">/</span>License:
&nbsp;
        Allen Day <span style="color: #66cc66;">&lt;</span>allenday\<span style="color: #0000ff;">@ucla</span>.edu<span style="color: #66cc66;">&gt;</span>, licensed under GPL <span style="color: #cc66cc;">2006</span><span style="color: #cc66cc;">-2008</span>
&nbsp;
EOF
  <span style="color: #000066;">exit</span><span style="color: #66cc66;">&#40;</span><span style="color: #cc66cc;">1</span><span style="color: #66cc66;">&#41;</span>;
<span style="color: #66cc66;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$line</span> = <span style="color: #66cc66;">&lt;&gt;</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
  <span style="color: #000066;">chomp</span><span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$line</span> <span style="color: #66cc66;">&#41;</span>;
  <span style="color: #b1b100;">foreach</span> <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$f</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #000066;">keys</span> <span style="color: #0000ff;">%format</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
    <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">@c</span> = <span style="color: #000066;">split</span> <span style="color: #ff0000;">','</span>, <span style="color: #0000ff;">$format</span><span style="color: #66cc66;">&#123;</span> <span style="color: #0000ff;">$f</span> <span style="color: #66cc66;">&#125;</span>;
&nbsp;
    <span style="color: #b1b100;">if</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$line</span> =~ <span style="color: #000066;">qr</span><span style="color: #66cc66;">/</span><span style="color: #0000ff;">$f</span><span style="color: #66cc66;">/</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
      <span style="color: #b1b100;">while</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #b1b100;">my</span> <span style="color: #66cc66;">&#40;</span> <span style="color: #0000ff;">$s</span>, <span style="color: #0000ff;">$t</span> <span style="color: #66cc66;">&#41;</span> = <span style="color: #0000ff;">$f</span> =~ <span style="color: #000066;">m</span><span style="color: #66cc66;">/</span>^<span style="color: #66cc66;">&#40;</span>.<span style="color: #66cc66;">*</span>?<span style="color: #66cc66;">&#41;</span>\<span style="color: #66cc66;">&#40;</span>+<span style="color: #66cc66;">&#40;</span>.+?<span style="color: #66cc66;">&#41;</span>\<span style="color: #66cc66;">&#41;</span>+<span style="color: #66cc66;">/</span> <span style="color: #66cc66;">&#41;</span> <span style="color: #66cc66;">&#123;</span>
        <span style="color: #b1b100;">my</span> <span style="color: #0000ff;">$c</span> = <span style="color: #000066;">pop</span> <span style="color: #0000ff;">@c</span> <span style="color: #66cc66;">||</span> <span style="color: #b1b100;">last</span>;
        <span style="color: #0000ff;">$line</span> =~ <span style="color: #000066;">s</span><span style="color: #66cc66;">/</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$s</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">&#40;</span><span style="color: #0000ff;">$t</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">/</span>$<span style="color: #cc66cc;">1</span>.colored<span style="color: #66cc66;">&#40;</span>$<span style="color: #cc66cc;">2</span>,<span style="color: #0000ff;">$c</span><span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">/</span>e;
        <span style="color: #0000ff;">$f</span> =~ <span style="color: #000066;">s</span><span style="color: #66cc66;">/</span>^<span style="color: #66cc66;">&#40;</span>.<span style="color: #66cc66;">*</span>?<span style="color: #66cc66;">&#41;</span>\<span style="color: #66cc66;">&#40;</span><span style="color: #66cc66;">&#40;</span>.+?<span style="color: #66cc66;">&#41;</span>\<span style="color: #66cc66;">&#41;</span><span style="color: #66cc66;">/</span>$<span style="color: #cc66cc;">1</span>$<span style="color: #cc66cc;">2</span><span style="color: #66cc66;">/</span>;
      <span style="color: #66cc66;">&#125;</span>
    <span style="color: #66cc66;">&#125;</span>
  <span style="color: #66cc66;">&#125;</span>
  <span style="color: #000066;">print</span> <span style="color: #ff0000;">&quot;$line<span style="color: #000099; font-weight: bold;">\n</span>&quot;</span>;
<span style="color: #66cc66;">&#125;</span></pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://www.spicylogic.com/allenday/blog/2008/07/05/pcoc-piped-command-output-colorizer/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>
