<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>pig中文教程 &#8211; 编码无悔 /  Intent &amp; Focused</title>
	<atom:link href="https://www.codelast.com/tag/pig%E4%B8%AD%E6%96%87%E6%95%99%E7%A8%8B/feed/" rel="self" type="application/rss+xml" />
	<link>https://www.codelast.com</link>
	<description>最优化之路</description>
	<lastBuildDate>Tue, 10 May 2022 10:17:55 +0000</lastBuildDate>
	<language>zh-Hans</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=6.9.4</generator>
	<item>
		<title>[原创]Apache Pig的一些基础概念及用法总结（2）</title>
		<link>https://www.codelast.com/%e5%8e%9f%e5%88%9bapache-pig%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e5%8f%8a%e7%94%a8%e6%b3%95%e6%80%bb%e7%bb%93%ef%bc%882%ef%bc%89/</link>
					<comments>https://www.codelast.com/%e5%8e%9f%e5%88%9bapache-pig%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e5%8f%8a%e7%94%a8%e6%b3%95%e6%80%bb%e7%bb%93%ef%bc%882%ef%bc%89/#comments</comments>
		
		<dc:creator><![CDATA[learnhard]]></dc:creator>
		<pubDate>Thu, 05 Apr 2012 06:45:06 +0000</pubDate>
				<category><![CDATA[Linux]]></category>
		<category><![CDATA[原创]]></category>
		<category><![CDATA[apache pig]]></category>
		<category><![CDATA[Pig Latin]]></category>
		<category><![CDATA[pig tutorial in Chinese]]></category>
		<category><![CDATA[pig中文教程]]></category>
		<guid isPermaLink="false">http://www.codelast.com/?p=4611</guid>

					<description><![CDATA[<p>查看更多Apache Pig的教程请点击<a href="https://www.codelast.com/?p=4550" rel="noopener noreferrer" target="_blank"><span style="background-color: rgb(255, 160, 122);">这里</span></a>。</p>
<p><span style="background-color: rgb(0, 255, 0);">▶▶</span>&#160;LIMIT操作并不会减少读入的数据量<br />
如果你只需要输出一个小数据集，通常你可以使用LIMIT来实现，例如：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &#34;Helvetica Neue&#34;, Helvetica, &#34;Hiragino Sans GB&#34;, &#34;Microsoft YaHei&#34;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&#160;=&#160;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&#160;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&#160;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&#160;(col1:&#160;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&#160;col2:&#160;chararray);
B&#160;=&#160;LIMIT&#160;A&#160;5;
DUMP&#160;B;
</code></pre>
</section>
<p>Pig会只加载5条记录，就不再读取其他的记录了吗？答案是：不会。Pig将读取数据文件中的所有记录，然后再从中挑5条。这是Pig可以做优化、却没有做的一点。<br />
<span style="color:#800080;">【更新】</span>Pig 0.10已经有了这功能了：</p>
<blockquote>
<div>
		Push Limit into Loader</div>
<div>
		Pig optimizes limit query by pushing limit automatically to the loader, thus requiring only a fraction of the entire input to be scanned.</div></blockquote>&#8230; <a href="https://www.codelast.com/%e5%8e%9f%e5%88%9bapache-pig%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e5%8f%8a%e7%94%a8%e6%b3%95%e6%80%bb%e7%bb%93%ef%bc%882%ef%bc%89/" class="read-more">Read More </a>]]></description>
										<content:encoded><![CDATA[<p>查看更多Apache Pig的教程请点击<a href="https://www.codelast.com/?p=4550" rel="noopener noreferrer" target="_blank"><span style="background-color: rgb(255, 160, 122);">这里</span></a>。</p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;LIMIT操作并不会减少读入的数据量<br />
如果你只需要输出一个小数据集，通常你可以使用LIMIT来实现，例如：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
B&nbsp;=&nbsp;LIMIT&nbsp;A&nbsp;5;
DUMP&nbsp;B;
</code></pre>
</section>
<p>Pig会只加载5条记录，就不再读取其他的记录了吗？答案是：不会。Pig将读取数据文件中的所有记录，然后再从中挑5条。这是Pig可以做优化、却没有做的一点。<br />
<span style="color:#800080;">【更新】</span>Pig 0.10已经有了这功能了：</p>
<blockquote>
<div>
		Push Limit into Loader</div>
<div>
		Pig optimizes limit query by pushing limit automatically to the loader, thus requiring only a fraction of the entire input to be scanned.</div>
</blockquote>
<div>
	按我的理解，上面这段话的含义是：Pig将LIMIT查询自动优化到loader中，这样就只会扫描整个输入数据集的一部分（而不是全部）。</div>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;使用UDF不一定要在Pig脚本中REGISTER，也可以在命令行指定<br />
大家知道，使用UDF需要在Pig脚本中REGISTER该UDF的jar包，但你可能不知道，你也可以不在Pig脚本中REGISTER它，而是通过命令行指定：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">pig&nbsp;-Dpig.additional.jars=/home/codelast/a.jar:/home/codelast/b.jar:/home/codelast/c.jar&nbsp;test.pig
</code></pre>
</section>
<p>以上命令告诉了我们几件事：<br />
<strong><span style="color:#ff0000;">①</span></strong>我们让Pig执行了test.pig脚本；<br />
<strong><span style="color:#ff0000;">②</span></strong>我们向Pig传入了&ldquo;pig.additional.jars&rdquo;这样一个参数，此参数的作用相当于在Pig脚本中REGISTER jar包；<br />
<strong><span style="color:#ff0000;">③</span></strong>如果你要REGISTER多个jar包，只需像上面的例子一样，用分号(:)把多个jar包路径隔开即可；<br />
<strong><span style="color:#ff0000;">④</span></strong>test.pig必须写在最后，而不能写成&ldquo;<span style="color:#0000ff;">pig test.pig -Dpig.additional.jars=XXX</span>&rdquo;这样，否则Pig直接报错：</p>
<blockquote>
<p>
		ERROR 2999: Unexpected internal error. Encountered unexpected arguments on command line - please check the command line.</p>
</blockquote>
<p>当然，为了可维护性好，你最好把REGISTER jar包写在Pig脚本中，不要通过命令行传入。<br />
<span id="more-4611"></span><br />
<span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;使用ORDER排序时，null会比所有值都小<br />
用ORDER按一个字段排序，如果该字段的所有值中有null，那么null会比其他值都小。</p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;如何按指定的几个字段来去重<br />
去重，即去除重复的记录。通常，我们使用DISTINCT来去除整行重复的记录，但是，如果我们只想用几个字段来去重，怎么做？<br />
假设有以下数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]$&nbsp;cat&nbsp;1.txt&nbsp;
1&nbsp;&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;uoip
1&nbsp;&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;jklm
9&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;sdfa
8&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;dddd
9&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;qqqq
8&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;sfew
</code></pre>
</section>
<p>我们要按第1、2、3、4个字段来去重，也就是说，去重结果应为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;uoip
9&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;sdfa
8&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;dddd
</code></pre>
</section>
<p>那么，我们可以这样做：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray,&nbsp;col2:&nbsp;chararray,&nbsp;col3:&nbsp;chararray,&nbsp;col4:&nbsp;chararray,&nbsp;col5:&nbsp;chararray);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col1,&nbsp;col2,&nbsp;col3,&nbsp;col4);
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;{
&nbsp;&nbsp;&nbsp;&nbsp;D&nbsp;=&nbsp;LIMIT&nbsp;A&nbsp;1;
&nbsp;&nbsp;&nbsp;&nbsp;GENERATE&nbsp;FLATTEN(D);
};
DUMP&nbsp;C;
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,2,3,4,uoip)
(8,8,8,8,dddd)
(9,7,5,3,sdfa)
</code></pre>
</section>
<p>代码很简单，就是利用了GROUP时会自动对group的key去重的功能，这里不用多解释大家应该也能看懂。</p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;如何设置Pig job的名字，使得在Hadoop jobtracker中可以清晰地识别出来<br />
在Pig脚本中的一开始处，写上这一句：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;job.name&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;This&nbsp;is&nbsp;my&nbsp;job&#39;</span>;
</code></pre>
</section>
<p>将使得Pig job name被设置为&ldquo;This is my job&rdquo;，从而在Hadoop jobtracker的web界面中可以很容易地找到你的job。如果不设置的话，其名字将显示为&ldquo;<span style="color:#b22222;">PigLatin:DefaultJobName</span>&rdquo;。</p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;&ldquo;<span style="color:#800080;">scalar has more than one row in the output</span>&rdquo;错误的一个原因<br />
遇到了这个错误？我来演示一下如何复现这个错误。<br />
假设有两个文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]$&nbsp;cat&nbsp;a.txt&nbsp;
1&nbsp;&nbsp;&nbsp;&nbsp;2
3&nbsp;&nbsp;&nbsp;&nbsp;4
[root@localhost&nbsp;~]$&nbsp;cat&nbsp;b.txt&nbsp;
3&nbsp;&nbsp;&nbsp;&nbsp;4
5&nbsp;&nbsp;&nbsp;&nbsp;6
</code></pre>
</section>
<p>现在我们来做一个JOIN：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;b.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1,&nbsp;B&nbsp;BY&nbsp;col1;
D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;A.col1;
DUMP&nbsp;D;
</code></pre>
</section>
<p>这段代码是必然会fail的，错误提示为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">org.apache.pig.backend.executionengine.ExecException:&nbsp;ERROR&nbsp;0:&nbsp;Scalar&nbsp;has&nbsp;more&nbsp;than&nbsp;one&nbsp;row&nbsp;in&nbsp;the&nbsp;output.&nbsp;1st&nbsp;:&nbsp;(1,2),&nbsp;2nd&nbsp;:(3,4)
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
乍一看，似乎代码简单得一点问题都没有啊？其实仔细一看，&ldquo;<span style="color:#ff0000;">A.col1</span>&rdquo;的写法根本就是错误的，应该写成&ldquo;<span style="color:#0000ff;">A::col1</span>&rdquo;才对，因为你只要 DESCRIBE 一下 C 的schema就明白了：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">C:&nbsp;{A::col1:&nbsp;int,A::col2:&nbsp;int,B::col1:&nbsp;int,B::col2:&nbsp;int}
</code></pre>
</section>
<p>Pig的这个错误提示得很不直观，在<a href="https://issues.apache.org/jira/browse/PIG-2134" rel="noopener noreferrer" target="_blank">这个链接</a>中也有人提到过了。</p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;如何输出LZO压缩格式的文本文件<br />
借助于elephant-bird，可以轻易完成这个工作。<br />
<span style="color:#ff0000;">方法1:</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;＝<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;input&#39;</span>;
STORE&nbsp;A&nbsp;INTO&nbsp;&#39;output&#39;&nbsp;USING&nbsp;com.twitter.elephantbird.pig.store.LzoPigStorage();
</code></pre>
</section>
<p>结果就会得到一堆名称类似于&ldquo;<span style="color:#0000ff;">part-m-00000.lzo</span>&rdquo;的文件。<br />
注意以上省略了一堆的&ldquo;<span style="color:#b22222;">REGISTER XXX.jar</span>&rdquo;代码，你需要自己添加上你的jar包路径。<br />
<span style="color:#ff0000;">方法2：</span><br />
在Pig脚本的最前面添加两句话：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;mapred.output.compression.codec&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;com.hadoop.compression.lzo.LzopCodec&#39;</span>;
<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;mapred.output.compress&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;true&#39;</span>;
</code></pre>
</section>
<p>STORE的时候（不需要USING...）会自动将文件保存为LZO格式。<br />
<span style="color: rgb(255, 255, 255);">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255);">http://www.codelast.com/</span></a><br />
有人说，那加载LZO压缩的文本文件呢？很简单：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;output&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">USING</span>&nbsp;com.twitter.elephantbird.pig.store.LzoPigStorage(<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;,&#39;</span>);
</code></pre>
</section>
<p>这表示指定了分隔符为逗号，如果不想指定，省略括号中的内容即可。</p>
<p><span style="background-color:#00ff00;"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;<span style="font-family: Ubuntu;">如何输出 gz 及 bz2 压缩格式的文件</span><br />
首先请看<a href="http://pig.apache.org/docs/r0.10.0/api/org/apache/pig/builtin/PigStorage.html" rel="noopener noreferrer" target="_blank"><span style="background-color:#ffa07a;">这个</span></a>链接的说明。摘录一段话：</p>
<blockquote>
<div>
		Compression</div>
<div>
		&nbsp;</div>
<div>
		Storing to a directory whose name ends in &quot;.bz2&quot; or &quot;.gz&quot; or &quot;.lzo&quot; (if you have installed support for LZO compression in Hadoop) will automatically use the corresponding compression codec.</div>
<div>
		output.compression.enabled and output.compression.codec job properties also work.</div>
<div>
		Loading from directories ending in .bz2 or .bz works automatically; other compression formats are not auto-detected on loading.</div>
</blockquote>
<div>
	说得简单点就是：当你把保存的目录名设置为以 .bz2 或 .gz 结尾时，输出的文件就自动会被压缩为对应的文件格式。<br />
	因此，我就有了下面这两段极其简单的示例代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">--压缩率稍低</span>
A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>;
STORE&nbsp;A&nbsp;INTO&nbsp;&#39;z.gz&#39;;

<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">--压缩率较高</span>
A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>;
STORE&nbsp;A&nbsp;INTO&nbsp;&#39;z.bz2&#39;;
</code></pre>
</section>
<p>正如上面的注释所示，bz2 的压缩率比 gz 高。<br />
	最后生成的目录下，文件名类似于：</p>
<blockquote>
<div>
			part-m-00000.gz</div>
<div>
			part-m-00001.gz</div>
<div>
			part-m-00002.gz</div>
</blockquote>
</div>
<p>如果是 bz2，则后缀名为 .bz2。</p>
<p>有人可能觉得这种通过目录名实现的方式不直观，那么你也可以在Pig脚本中指定输入文件的压缩格式。下面的例子演示了如何输出gzip格式的压缩文件（此时，目录名就不用以.gz结尾了）：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;output.compression.enabled&nbsp;<span class="hljs-literal" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(174, 135, 250); word-wrap: inherit !important; word-break: inherit !important;">true</span>;
<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;output.compression.codec&nbsp;org.apache.hadoop.io.compress.GzipCodec;
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255);">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255);">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;<span style="font-family: Ubuntu;">类似于Java字符串的contains方法，在Pig中怎么做<br />
例如，要判断一个字符串&ldquo;abc&rdquo;是否包含&ldquo;bc&rdquo;，怎么做？<br />
有一个方法，利用Pig内置的</span>INDEXOF函数来实现：</p>
<blockquote>
<p>
		<span style="color:#0000ff;">INDEXOF(string, &#39;character&#39;, startIndex)</span></p>
<div>
		string: The string to be searched.</div>
<div>
		&#39;character&#39;: The character being searched for, in quotes.</div>
<div>
		startIndex: The index from which to begin the forward search. The string index begins with zero (0).</div>
</blockquote>
<div>
	即：string 相当于上面所说的字符串&ldquo;abc&rdquo;，&#39;character&#39;相当于上面所说的字符串&ldquo;bc&rdquo;，startIndex设置为0即可，表示从字符串&ldquo;abc&rdquo;的头开始搜索。<br />
	如果能查到字符串，则返回值为搜索到的字符串的起始索引位置；如果查不到字符串，则返回值为-1。<br />
	这里给一个例子。输入数据为：</div>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost]&nbsp;~$&nbsp;cat&nbsp;a.txt&nbsp;
abc&nbsp;&nbsp;&nbsp;&nbsp;123
</code></pre>
</section>
<p><span style="font-family: Ubuntu;">Pig脚本：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray,&nbsp;col2:&nbsp;chararray);
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;INDEXOF(col1,&nbsp;&#39;abc&#39;,&nbsp;0),&nbsp;INDEXOF(col2,&nbsp;&#39;9&#39;,&nbsp;0);
DUMP&nbsp;B;
</code></pre>
</section>
<p><span style="font-family: Ubuntu;">输出结果：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(0,-1)
</code></pre>
</section>
<p><span style="font-family: Ubuntu;">可见，查不到的字符串，其返回值为-1。</span></p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;<span style="font-family: Ubuntu;">类型为long的字段做FILTER的时候，数字没有加&ldquo;L&rdquo;可能会导致结果错误！<br />
我遇到的一个真实问题，下面的代码：</span></p>
<div>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;/path/to/my/data&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">USING</span>&nbsp;MyLOADER;
B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;(fieldId&nbsp;==&nbsp;3125694275663397348L&nbsp;OR&nbsp;fieldId&nbsp;==&nbsp;3125694275741155094L);
</code></pre>
</section>
</div>
<p><span style="font-family: Ubuntu;">对LOAD进来的数据进行了一个简单的FILTER，其中，<span style="color:#0000ff;">fieldId</span> 是一个类型为 long 的字段，这样写没有任何问题。但是如果把FILTER条件里的数字后面的&ldquo;<span style="color:#ff0000;">L</span>&rdquo;去掉，Pig不会报错，但FILTER语句可能就会失效。所以在写字面数字的时候，一定要为long类型后面加上&ldquo;<span style="color:#ff0000;">L</span>&rdquo;！</span></p>
<p><span style="background-color: rgb(0, 255, 0);"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/25b6.png" alt="▶" class="wp-smiley" style="height: 1em; max-height: 1em;" /></span>&nbsp;<span style="font-family: Ubuntu;">加载Sequence file不能当作纯文本文件来加载</span><br />
假设Sequence file是 TAB制表符&nbsp;分隔的两列数据，正确加载它可以这样写：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;xxx&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">USING</span>&nbsp;com.twitter.elephantbird.pig.load.SequenceFileLoader();
<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">--&nbsp;SequenceFileLoader加载出来的数据，自动会加上schema：{key:&nbsp;chararray,value:&nbsp;chararray}</span>
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;key&nbsp;AS&nbsp;col1,&nbsp;value&nbsp;AS&nbsp;col2;
C&nbsp;=&nbsp;LIMIT&nbsp;B&nbsp;10;
DUMP&nbsp;C;
</code></pre>
</section>
<p>如果你把它当作未压缩的纯文本文件来加载（不使用USING XXX），则会发现job不报错，但实际上读出来的数据都是乱码。<br />
<span style="color: rgb(255, 255, 255);">文章来源：</span><a href="https://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255);">https://www.codelast.com/</span></a><br />
<span style="color: rgb(255, 0, 0);">➤➤</span>&nbsp;版权声明&nbsp;<span style="color: rgb(255, 0, 0);">➤➤</span>&nbsp;<br />
转载需注明出处：<u><a href="https://www.codelast.com/" rel="noopener noreferrer" target="_blank"><em><span style="color: rgb(0, 0, 255);"><strong style="font-size: 16px;"><span style="font-family: arial, helvetica, sans-serif;">codelast.com</span></strong></span></em></a></u>&nbsp;<br />
感谢关注我的微信公众号（微信扫一扫）：<br />
<img decoding="async" alt="wechat qrcode of codelast" src="https://www.codelast.com/codelast_wechat_qr_code.jpg" style="color: rgb(77, 77, 77); font-size: 13px; width: 200px; height: 200px;" /><br />
以及我的微信视频号：</p>
<p style="border: 0px; font-size: 13px; margin: 0px 0px 9px; outline: 0px; padding: 0px; color: rgb(77, 77, 77);">
	<img decoding="async" alt="" src="https://www.codelast.com/wechat_shipinhao_qr_code.jpg" style="text-align: center; width: 200px; height: 199px;" /></p>
]]></content:encoded>
					
					<wfw:commentRss>https://www.codelast.com/%e5%8e%9f%e5%88%9bapache-pig%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e5%8f%8a%e7%94%a8%e6%b3%95%e6%80%bb%e7%bb%93%ef%bc%882%ef%bc%89/feed/</wfw:commentRss>
			<slash:comments>2</slash:comments>
		
		
			</item>
		<item>
		<title>[原创]Apache Pig的一些基础概念及用法总结（1）</title>
		<link>https://www.codelast.com/%e5%8e%9f%e5%88%9bpig%e4%b8%ad%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e6%80%bb%e7%bb%93/</link>
					<comments>https://www.codelast.com/%e5%8e%9f%e5%88%9bpig%e4%b8%ad%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e6%80%bb%e7%bb%93/#comments</comments>
		
		<dc:creator><![CDATA[learnhard]]></dc:creator>
		<pubDate>Fri, 23 Sep 2011 09:42:47 +0000</pubDate>
				<category><![CDATA[Linux]]></category>
		<category><![CDATA[原创]]></category>
		<category><![CDATA[apache pig]]></category>
		<category><![CDATA[pig中文教程]]></category>
		<category><![CDATA[UDF]]></category>
		<category><![CDATA[中文教程]]></category>
		<category><![CDATA[基础]]></category>
		<guid isPermaLink="false">http://www.codelast.com/?p=3621</guid>

					<description><![CDATA[<p>查看更多Apache Pig的教程请点击<a href="https://www.codelast.com/?p=4550" rel="noopener noreferrer" target="_blank"><span style="background-color: rgb(255, 160, 122);">这里</span></a>。</p>
<p>本文可以让刚接触pig的人对一些基础概念有个初步的了解。<br />
很久很久以前，本文大概是互联网上第一篇公开发表的且涵盖大量实际例子的Apache Pig中文教程（由Google搜索可知），文中的大量实例都是作者Darran Zhang（website: codelast.com）在工作、学习中总结的经验或解决的问题，并且添加了较为详尽的说明及注解，希望能帮助一部分人。</p>
<p><a href="http://pig.apache.org/" rel="noopener noreferrer" target="_blank">Apache pig</a>是用来处理大规模数据的高级查询语言，配合Hadoop使用，可以在处理海量数据时达到事半功倍的效果，比使用Java，C++等语言编写大规模数据处理程序的难度要小N倍，实现同样的效果的代码量也小N倍。<br />
但是刚接触pig时，可能会觉得里面的某些概念以及程序实现方法与想像中的很不一样，所以，你需要仔细地研究一下基础概念，这样在写pig程序的时候，才不会觉得非常别扭。<br />
<span id="more-3621"></span><br />
本文大部分内容基于 Pig 0.8.1 写作而成。众所周知，这个版本的Pig现在已经非常过时，但是，无论新版旧版，Pig的基础用法在很多情况下都是一致的，所以，这并不影响你学习。<br />
本文的部分内容来自Pig官方文档，但涉及到翻译的部分，也是我自己翻译的，因此可能理解与英文有偏差，如果你觉得有疑义，可参考英文内容。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
下面开始学习Pig。</p>
<p><span style="background-color:#00ff00;">➤</span> 关系（relation）、包（bag）、元组（tuple）、字段（field）、数据（data）的关系</p>
<ul>
<li>
		一个关系（relation）是一个包（bag），更具体地说，是一个外部的包（outer bag）。</li>
<li>
		一个包（bag）是一个元组（tuple）的集合。<span style="color:#0000cd;">在pig中表示数据时，用大括号{}括起来的东西表示一个包&#8212;&#8212;无论是在教程中的实例演示，还是在pig交互模式下的输出，都遵循这样的约定，请牢记这一点，因为不理解的话就会对数据结构的掌握产生偏差</span>。</li>
<li>
		一个元组（tuple）是若干字段（field）的一个有序集（ordered set）。<span style="color:#0000cd;">在pig中表示数据时，用小括号()括起来的东西表示一个元组</span>。</li>
<li>
		一个字段是一块数据（data）。</li>
</ul>
<p>&#8220;元组&#8221;这个词很抽象，你可以把它想像成关系型数据库表中的一行，它含有一个或多个字段，其中，每一个字段可以是任何数据类型，并且可以有或者没有数据。<br />
&#8220;关系&#8221;可以比喻成关系型数据库的一张表，而上面说了，&#8220;元组&#8221;可以比喻成数据表中的一行，那么这里有人要问了，在关系型数据库中，同一张表中的每一行都有固定的字段数，pig中的&#8220;关系&#8221;与&#8220;元组&#8221;之间，是否也是这样的情况呢？不是的。&#8220;关系&#8221;并不要求每一个&#8220;元组&#8221;都含有相同数量的字段，并且也不会要求各&#8220;元组&#8221;中在相同位置处的字段具有相同的数据类型（太随意了，是吧？）<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&#160;一个 计算多维度组合下的平均值 的实际例子<br />
为了帮助大家理解pig的一个基本的数据处理流程，我造了一些简单的数据来举个例子&#8212;&#8212;<br />
假设有数据文件：a.txt（各数值之间是以tab分隔的）：&#8230; <a href="https://www.codelast.com/%e5%8e%9f%e5%88%9bpig%e4%b8%ad%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e6%80%bb%e7%bb%93/" class="read-more">Read More </a></p>]]></description>
										<content:encoded><![CDATA[<p>查看更多Apache Pig的教程请点击<a href="https://www.codelast.com/?p=4550" rel="noopener noreferrer" target="_blank"><span style="background-color: rgb(255, 160, 122);">这里</span></a>。</p>
<p>本文可以让刚接触pig的人对一些基础概念有个初步的了解。<br />
很久很久以前，本文大概是互联网上第一篇公开发表的且涵盖大量实际例子的Apache Pig中文教程（由Google搜索可知），文中的大量实例都是作者Darran Zhang（website: codelast.com）在工作、学习中总结的经验或解决的问题，并且添加了较为详尽的说明及注解，希望能帮助一部分人。</p>
<p><a href="http://pig.apache.org/" rel="noopener noreferrer" target="_blank">Apache pig</a>是用来处理大规模数据的高级查询语言，配合Hadoop使用，可以在处理海量数据时达到事半功倍的效果，比使用Java，C++等语言编写大规模数据处理程序的难度要小N倍，实现同样的效果的代码量也小N倍。<br />
但是刚接触pig时，可能会觉得里面的某些概念以及程序实现方法与想像中的很不一样，所以，你需要仔细地研究一下基础概念，这样在写pig程序的时候，才不会觉得非常别扭。<br />
<span id="more-3621"></span><br />
本文大部分内容基于 Pig 0.8.1 写作而成。众所周知，这个版本的Pig现在已经非常过时，但是，无论新版旧版，Pig的基础用法在很多情况下都是一致的，所以，这并不影响你学习。<br />
本文的部分内容来自Pig官方文档，但涉及到翻译的部分，也是我自己翻译的，因此可能理解与英文有偏差，如果你觉得有疑义，可参考英文内容。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
下面开始学习Pig。</p>
<p><span style="background-color:#00ff00;">➤</span> 关系（relation）、包（bag）、元组（tuple）、字段（field）、数据（data）的关系</p>
<ul>
<li>
		一个关系（relation）是一个包（bag），更具体地说，是一个外部的包（outer bag）。</li>
<li>
		一个包（bag）是一个元组（tuple）的集合。<span style="color:#0000cd;">在pig中表示数据时，用大括号{}括起来的东西表示一个包&mdash;&mdash;无论是在教程中的实例演示，还是在pig交互模式下的输出，都遵循这样的约定，请牢记这一点，因为不理解的话就会对数据结构的掌握产生偏差</span>。</li>
<li>
		一个元组（tuple）是若干字段（field）的一个有序集（ordered set）。<span style="color:#0000cd;">在pig中表示数据时，用小括号()括起来的东西表示一个元组</span>。</li>
<li>
		一个字段是一块数据（data）。</li>
</ul>
<p>&ldquo;元组&rdquo;这个词很抽象，你可以把它想像成关系型数据库表中的一行，它含有一个或多个字段，其中，每一个字段可以是任何数据类型，并且可以有或者没有数据。<br />
&ldquo;关系&rdquo;可以比喻成关系型数据库的一张表，而上面说了，&ldquo;元组&rdquo;可以比喻成数据表中的一行，那么这里有人要问了，在关系型数据库中，同一张表中的每一行都有固定的字段数，pig中的&ldquo;关系&rdquo;与&ldquo;元组&rdquo;之间，是否也是这样的情况呢？不是的。&ldquo;关系&rdquo;并不要求每一个&ldquo;元组&rdquo;都含有相同数量的字段，并且也不会要求各&ldquo;元组&rdquo;中在相同位置处的字段具有相同的数据类型（太随意了，是吧？）<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;一个 计算多维度组合下的平均值 的实际例子<br />
为了帮助大家理解pig的一个基本的数据处理流程，我造了一些简单的数据来举个例子&mdash;&mdash;<br />
假设有数据文件：a.txt（各数值之间是以tab分隔的）：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
a&nbsp;1&nbsp;2&nbsp;3&nbsp;4.2&nbsp;9.8
a&nbsp;3&nbsp;0&nbsp;5&nbsp;3.5&nbsp;2.1
b&nbsp;7&nbsp;9&nbsp;9&nbsp;-&nbsp;-
a&nbsp;7&nbsp;9&nbsp;9&nbsp;2.6&nbsp;6.2
a&nbsp;1&nbsp;2&nbsp;5&nbsp;7.7&nbsp;5.9
a&nbsp;1&nbsp;2&nbsp;3&nbsp;1.4&nbsp;0.2
</code></pre>
</section>
<p>问题如下：怎样求出在第2、3、4列的所有组合的情况下，最后两列的平均值分别是多少？<br />
例如，第2、3、4列有一个组合为（1，2，3），即第一行和最后一行数据。对这个维度组合来说，最后两列的平均值分别为：<br />
（4.2+1.4）/2＝2.8<br />
（9.8+0.2）/2＝5.0<br />
而对于第2、3、4列的其他所有维度组合，都分别只有一行数据，因此最后两列的平均值其实就是它们自身。<br />
特别地，组合（7，9，9）有两行记录：第三、四行，但是第三行数据的最后两列没有值，因此它不应该被用于平均值的计算，也就是说，在计算平均值时，第三行是无效数据。所以（7，9，9）组合的最后两列的平均值为 2.6 和 6.2。<br />
我们现在用pig来算一下，并且输出最终的结果。<br />
先进入本地调试模式（pig -x local），再依次输入如下pig代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col2,&nbsp;col3,&nbsp;col4);
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;group,&nbsp;AVG(A.col5),&nbsp;AVG(A.col6);
DUMP&nbsp;C;
</code></pre>
</section>
<p>pig输出结果如下：</p>
<blockquote>
<div>
		((1,2,3),2.8,5.0)</div>
<div>
		((1,2,5),7.7,5.9)</div>
<div>
		((3,0,5),3.5,2.1)</div>
<div>
		((7,9,9),2.6,6.2)</div>
</blockquote>
<p>这个结果对吗？手工算一下就知道是对的。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
下面，我们依次来看看每一句pig代码分别得到了什么样的数据。<br />
<strong><span style="color:#ff0000;">①</span></strong>加载 a.txt 文件，并指定每一列的数据类型分别为 chararray（字符串），int，int，int，double，double。同时，我们还给予了每一列别名，分别为 col1，col2，&hellip;&hellip;，col6。这个别名在后面的数据处理中会用到&mdash;&mdash;如果你不指定别名，那么在后面的处理中，就只能使用索引（$0，$1，&hellip;&hellip;）来标识相应的列了，这样可读性会变差，因此，在列固定的情况下，还是指定别名的好。<br />
将数据加载之后，保存到变量A中，A的数据结构如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A:&nbsp;{col1:&nbsp;chararray,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int,col5:&nbsp;double,col6:&nbsp;double}
</code></pre>
</section>
<p>可见，A是用大括号括起来的东西。根据本文前面的说法，A是一个包（bag）。<br />
这个时候，A与你想像中的样子应该是一致的，也就是与前面打印出来的 a.txt 文件的内容是一样的，还是一行一行的类似于&ldquo;二维表&rdquo;的数据。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<strong><span style="color:#ff0000;">②</span></strong>按照A的第2、3、4列，对A进行分组。pig会找出所有第2、3、4列的组合，并按照升序进行排列，然后将它们与对应的包A整合起来，得到如下的数据结构：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">B:&nbsp;{group:&nbsp;(col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int),A:&nbsp;{col1:&nbsp;chararray,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int,col5:&nbsp;double,col6:&nbsp;double}}
</code></pre>
</section>
<p>可见，A的第2、3、4列的组合被pig赋予了一个别名：<span style="color:#0000ff;">group</span>，这很形象。同时我们也观察到，B的每一行其实就是由一个group和若干个A组成的&mdash;&mdash;注意，是若干个A。这里之所以只显示了一个A，是因为这里表示的是数据结构，而不表示具体数据有多少组。<br />
实际的数据为：</p>
<blockquote>
<div>
		((1,2,3),{(a,1,2,3,4.2,9.8),(a,1,2,3,1.4,0.2)})</div>
<div>
		((1,2,5),{(a,1,2,5,7.7,5.9)})</div>
<div>
		((3,0,5),{(a,3,0,5,3.5,2.1)})</div>
<div>
		((7,9,9),{(b,7,9,9,,),(a,7,9,9,2.6,6.2)})</div>
</blockquote>
<p>可见，与前面所说的一样，组合（1，2，3）对应了两行数据，组合（7，9，9）也对应了两行数据。<br />
这个时候，B的结构就不那么明朗了，可能与你想像中有一点不一样了。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<strong><span style="color:#ff0000;">③</span></strong>计算每一种组合下的最后两列的平均值。<br />
根据上面得到的B的数据，你可以把B想像成一行一行的数据（只不过这些行不是对称的），FOREACH 的作用是对 B 的每一行数据进行遍历，然后进行计算。<br />
GENERATE 可以理解为要生成什么样的数据，这里的 group 就是上一步操作中B的第一项数据（即pig为A的第2、3、4列的组合赋予的别名），所以它告诉了我们：在数据集 C 的每一行里，第一项就是B中的group&mdash;&mdash;类似于（1，2，5）这样的东西）。<br />
而 AVG(A.col5) 这样的计算，则是调用了pig的一个求平均值的函数 AVG，用于对 A 的名为 col5 的列求平均值。前文说了，在加载数据到A的时候，我们已经给每一列起了个别名，col5就是倒数第二列。<br />
到这里，可能有人要迷糊了：难道 AVG(A.col5) 不是表示对 A 的col5这一列求平均值吗？也就是说，在遍历B（FOREACH B）的每一行时候，计算结果都是相同的啊！<br />
事实上并不是这样。我们遍历的是B，我们需要注意到，B的数据结构中，每一行数据里，一个group对应的是若干个A，因此，这里的 A.col5，指的是B的每一行中的A，而不是包含全部数据的那个A。拿B的第一行来举例：<br />
((1,2,3),{(a,1,2,3,4.2,9.8),(a,1,2,3,1.4,0.2)})<br />
遍历到B的这一行时，要计算AVG(A.col5)，pig会找到&nbsp;(a,1,2,3,4.2,9.8) 中的4.2，以及(a,1,2,3,1.4,0.2)中的1.4，加起来除以2，就得到了平均值。<br />
同理，我们也知道了AVG(A.col6)是怎么算出来的。但还有一点要注意的：对(7,9,9)这个组，它对应的数据(b,7,9,9,,)里最后两列是无值的，这是因为我们的数据文件对应位置上不是有效数字，而是两个&ldquo;-&rdquo;，pig在加载数据的时候自动将它置为空了，并且计算平均值的时候，也不会把这一组数据考虑在内（相当于忽略这组数据的存在）。<br />
到了这里，我们不难理解，为什么C的数据结构是这样的了：</p>
<blockquote>
<p>
		C: {group: (col2: int,col3: int,col4: int),double,double}</p>
</blockquote>
<p><span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<strong><span style="color:#ff0000;">④</span></strong>DUMP C就是将C中的数据输出到控制台。如果要输出到文件，需要使用：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">STORE&nbsp;C&nbsp;INTO&nbsp;&#39;output&#39;;
</code></pre>
</section>
<p>这样pig就会在当前目录下新建一个&ldquo;output&rdquo;目录（该目录必须事先不存在），并把结果文件放到该目录下。</p>
<p>请想像一下，如果要实现相同的功能，用Java或C++写一个Map-Reduce应用程序需要多少时间？可能仅仅是写一个build.xml或者Makefile，所需的时间就是写这段pig代码的几十倍了！<br />
正因为pig有如此优势，它才得到了广泛应用。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;怎样统计数据行数<br />
在SQL语句中，要统计表中数据的行数，很简单：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">SELECT</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">COUNT</span>(*)&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">FROM</span>&nbsp;table_name&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">WHERE</span>&nbsp;condition
</code></pre>
</section>
<p>在pig中，也有一个COUNT函数，在pig手册中，对COUNT函数有这样的说明：</p>
<blockquote>
<p>
		Computes the number of elements in a bag.</p>
</blockquote>
<p>假设要计算数据文件a.txt的行数：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4.2&nbsp;9.8
a&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3.5&nbsp;2.1
b&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;-
a&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;2.6&nbsp;6.2
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;7.7&nbsp;5.9
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1.4&nbsp;0.2
</code></pre>
</section>
<p>你是否可以这样做呢：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
B&nbsp;=&nbsp;COUNT(*);
DUMP&nbsp;B;
</code></pre>
</section>
<p>答案是：绝对不行。pig会报错。pig手册中写得很明白：</p>
<blockquote>
<p>
		Note: You cannot use the tuple designator (*) with COUNT; that is, COUNT(*) will not work.</p>
</blockquote>
<p>那么，这样对某一列计数行不行呢：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">B&nbsp;=&nbsp;COUNT(A.col2);
</code></pre>
</section>
<p>答案是：仍然不行。pig会报错。<br />
这就与我们想像中的&ldquo;正确做法&rdquo;有点不一样了：我为什么不能直接统计一个字段的数目有多少呢？刚接触pig的时候，一定非常疑惑这样明显&ldquo;不应该出错&rdquo;的写法为什么行不通。<br />
要统计A中含col2字段的数据有多少行，正确的做法是：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;ALL;
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;COUNT(A.col2);
DUMP&nbsp;C;
</code></pre>
</section>
<p>输出结果：</p>
<blockquote>
<p>
		(6)</p>
</blockquote>
<p>表明有6行数据。<br />
如此麻烦？没错。这是由pig的数据结构决定的。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
在这个例子中，统计COUNT(A.col2)和COUNT(A)的结果是一样的，但是，如果col2这一列中含有空值：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4.2&nbsp;9.8
a&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3.5&nbsp;2.1
b&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;-
a&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;2.6&nbsp;6.2
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;7.7&nbsp;5.9
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1.4&nbsp;0.2
</code></pre>
</section>
<p>则以下pig程序及执行结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;ALL;
grunt&gt;&nbsp;C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;COUNT(A.col2);
grunt&gt;&nbsp;DUMP&nbsp;C;
(5)
</code></pre>
</section>
<p>可见，结果为5行。那是因为你LOAD数据的时候指定了col2的数据类型为int，而a.txt的第二行数据是空的，因此数据加载到A以后，有一个字段就是空的：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;DUMP&nbsp;A;
(a,1,2,3,4.2,9.8)
(a,,0,5,3.5,2.1)
(b,7,9,9,,)
(a,7,9,9,2.6,6.2)
(a,1,2,5,7.7,5.9)
(a,1,2,3,1.4,0.2)
</code></pre>
</section>
<p>在COUNT的时候，null的字段不会被计入在内，所以结果是5。</p>
<blockquote>
<p>
		The COUNT function follows syntax semantics and ignores nulls. What this means is that a tuple in the bag will not be counted if the first field in this tuple is NULL. If you want to include NULL values in the count computation, use COUNT_STAR.</p>
</blockquote>
<p><span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a></p>
<div>
	<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;<span style="color:#ff0000;">FLATTEN</span>操作符的作用</div>
<div>
	这个玩意一开始还是挺让我费解的。从字面上看，flatten就是&ldquo;弄平&rdquo;的意思，但是在对一个pig的数据结构操作时，flatten到底是&ldquo;弄平&rdquo;了什么，又有什么作用呢？<br />
	我们还是采用前面的a.txt数据文件来说明：</div>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4.2&nbsp;9.8
a&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3.5&nbsp;2.1
b&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;-
a&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;2.6&nbsp;6.2
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;7.7&nbsp;5.9
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1.4&nbsp;0.2
</code></pre>
</section>
<p>如果我们按照前文的做法，计算多维度组合下的最后两列的平均值，则：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col2,&nbsp;col3,&nbsp;col4);
grunt&gt;&nbsp;C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;group,&nbsp;AVG(A.col5),&nbsp;AVG(A.col6);
grunt&gt;&nbsp;DUMP&nbsp;C;
((1,2,3),2.8,5.0)
((1,2,5),7.7,5.9)
((3,0,5),3.5,2.1)
((7,9,9),2.6,6.2)
</code></pre>
</section>
<p>可见，输出结果中，每一行的第一项是一个tuple（元组），我们来试试看 FLATTEN 的作用：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col2,&nbsp;col3,&nbsp;col4);
grunt&gt;&nbsp;C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;FLATTEN(group),&nbsp;AVG(A.col5),&nbsp;AVG(A.col6);
grunt&gt;&nbsp;DUMP&nbsp;C;
(1,2,3,2.8,5.0)
(1,2,5,7.7,5.9)
(3,0,5,3.5,2.1)
(7,9,9,2.6,6.2)
</code></pre>
</section>
<p>看到了吗？被 FLATTEN 的group本来是一个元组，现在变成了扁平的结构了。按照pig文档的说法，FLATTEN用于对元组（tuple）和包（bag）&ldquo;解嵌套&rdquo;（un-nest）：</p>
<blockquote>
<div>
		The FLATTEN operator looks like a UDF syntactically, but it is actually an operator that changes the structure of tuples and bags in a way that a UDF cannot. Flatten un-nests tuples as well as bags. The idea is the same, but the operation and result is different for each type of structure.</div>
<div>
		&nbsp;</div>
<div>
		For tuples, flatten substitutes the fields of a tuple in place of the tuple. For example, consider a relation that has a tuple of the form (a, (b, c)). The expression GENERATE $0, flatten($1), will cause that tuple to become (a, b, c).</div>
</blockquote>
<p><span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
所以我们就看到了上面的结果。<br />
在有的时候，不&ldquo;解嵌套&rdquo;的数据结构是不利于观察的，输出这样的数据可能不利于外围数程序的处理（例如，pig将数据输出到磁盘后，我们还需要用其他程序做后续处理，而对一个元组，输出的内容里是含括号的，这就在处理流程上又要多一道去括号的工序），因此，FLATTEN提供了一个让我们在某些情况下可以清楚、方便地分析数据的机会。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;关于<span style="color:#ff0000;">GROUP</span>操作符<br />
在上文的例子中，已经演示了GROUP操作符会生成什么样的数据。在这里，需要说得更理论一些：</p>
<ul>
<li>
		用于GROUP的key如果多于一个字段（正如本文前面的例子），则GROUP之后的数据的key是一个元组（tuple），否则它就是与用于GROUP的key相同类型的东西。</li>
<li>
		GROUP的结果是一个关系（relation），在这个关系中，每一组包含一个元组（tuple），这个元组包含两个字段：<strong><span style="color:#008080;">（1）</span></strong>第一个字段被命名为&ldquo;<span style="color:#0000ff;">group</span>&rdquo;&mdash;&mdash;<span style="color:#ff0000;">这一点非常容易与GROUP关键字相混淆</span>，但请区分开来。该字段的类型与用于GROUP的key类型相同。<strong><span style="color:#008080;">（2）</span></strong>第二个字段是一个包（bag），它的类型与被GROUP的关系的类型相同。</li>
</ul>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;把数据当作&ldquo;元组&rdquo;（tuple）来加载<br />
还是假设有如下数据：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4.2&nbsp;9.8
a&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3.5&nbsp;2.1
b&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;-
a&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;2.6&nbsp;6.2
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;7.7&nbsp;5.9
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1.4&nbsp;0.2
</code></pre>
</section>
<p>如果我们按照以下方式来加载数据：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>);&nbsp;
</code></pre>
</section>
<p>那么得到的A的数据结构为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;A;
A:&nbsp;{col1:&nbsp;chararray,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int,col5:&nbsp;double,col6:&nbsp;double}
</code></pre>
</section>
<p>如果你要把A当作一个元组（tuple）来加载：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(T&nbsp;:&nbsp;tuple&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>));
</code></pre>
</section>
<p>也就是想要得到这样的数据结构：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;A;
A:&nbsp;{T:&nbsp;(col1:&nbsp;chararray,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int,col5:&nbsp;double,col6:&nbsp;double)}
</code></pre>
</section>
<p>那么，上面的方法将得到一个空的A：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;DUMP&nbsp;A;
()
()
()
()
()
()
</code></pre>
</section>
<p>那是因为数据文件a.txt的结构不适合于这样加载成元组（tuple）。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
如果有数据文件b.txt：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;b.txt&nbsp;
(a,1,2,3,4.2,9.8)
(a,3,0,5,3.5,2.1)
(b,7,9,9,-,-)
(a,7,9,9,2.6,6.2)
(a,1,2,5,7.7,5.9)
(a,1,2,3,1.4,0.2)
</code></pre>
</section>
<p>则使用上面所说的加载方法及结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;b.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(T&nbsp;:&nbsp;tuple&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>));
grunt&gt;&nbsp;DUMP&nbsp;A;
((a,1,2,3,4.2,9.8))
((a,3,0,5,3.5,2.1))
((b,7,9,9,,))
((a,7,9,9,2.6,6.2))
((a,1,2,5,7.7,5.9))
((a,1,2,3,1.4,0.2))
</code></pre>
</section>
<p>可见，加载的数据的结构确实被定义成了元组（tuple）。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;在多维度组合下，如何计算某个维度组合里的不重复记录的条数<br />
以数据文件 c.txt 为例：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;c.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4.2&nbsp;9.8&nbsp;100
a&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;3.5&nbsp;2.1&nbsp;200
b&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp;300
a&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;2.6&nbsp;6.2&nbsp;300
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;7.7&nbsp;5.9&nbsp;200
a&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1.4&nbsp;0.2&nbsp;500
</code></pre>
</section>
<p>问题：如何计算在第2、3、4列的所有维度组合下，最后一列不重复的记录分别有多少条？例如，第2、3、4列有一个维度组合是（1，2，3），在这个维度维度下，最后一列有两种值：100 和 500，因此不重复的记录数为2。同理可求得其他的记录条数。<br />
pig代码及输出结果如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;c.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col5:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col6:<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">double</span>,&nbsp;col7:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col2,&nbsp;col3,&nbsp;col4);
grunt&gt;&nbsp;C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;{D&nbsp;=&nbsp;DISTINCT&nbsp;A.col7;&nbsp;GENERATE&nbsp;group,&nbsp;COUNT(D);};
grunt&gt;&nbsp;DUMP&nbsp;C;
((1,2,3),2)
((1,2,5),1)
((3,0,5),1)
((7,9,9),1)
</code></pre>
</section>
<p>我们来看看每一步分别生成了什么样的数据：<br />
<strong><span style="color:#ff0000;">①</span></strong>LOAD不用说了，就是加载数据；<br />
<strong><span style="color:#ff0000;">②</span></strong>GROUP也不用说了，和前文所说的一样。GROUP之后得到了这样的数据：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;DUMP&nbsp;B;
((1,2,3),{(a,1,2,3,4.2,9.8,100),(a,1,2,3,1.4,0.2,500)})
((1,2,5),{(a,1,2,5,7.7,5.9,200)})
((3,0,5),{(a,3,0,5,3.5,2.1,200)})
((7,9,9),{(b,7,9,9,,,300),(a,7,9,9,2.6,6.2,300)})
</code></pre>
</section>
<p>其实到这里，我们肉眼就可以看出来最后要求的结果是什么了，当然，必须要由pig代码来完成，要不然怎么应对海量数据？<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<strong><span style="color:#ff0000;">③</span></strong>这里的 FOREACH 与前面有点不一样，这就是所谓的&ldquo;<span style="color:#0000ff;">嵌套的FOREACH</span>&rdquo;。第一次看到这种写法，肯定会觉得很奇怪。先看一下用于<span style="color:#0000ff;">去重</span>的<span style="color:#ff0000;">DISTINCT</span>关键字的说明：</p>
<blockquote>
<p>
		Removes duplicate tuples in a relation.</p>
</blockquote>
<p>然后再解释一下：FOREACH 是对B的每一行进行遍历，其中，B的每一行里含有一个包（bag），每一个包中含有若干元组（tuple）A，因此，FOREACH 后面的大括号里的操作，其实是对所谓的&ldquo;内部包&rdquo;（<span style="color:#ff0000;">inner bag</span>）的操作（详情请参看<a href="http://pig.apache.org/docs/r0.8.1/piglatin_ref2.html#FOREACH" rel="noopener noreferrer" target="_blank"><span style="color:#800080;">FOREACH的说明</span></a>），在这里，我们指定了对A的col7这一列进行去重，去重的结果被命名为D，然后再对D计数（COUNT），就得到了我们想要的结果。<br />
<strong><span style="color:#ff0000;">④</span></strong>输出结果数据，与前文所述的差不多。<br />
这样就达成了我们的目的。从总体上说，刚接触pig不久的人会觉得这些写法怪怪的，就是扭不过来，但是要坚持，时间长了，连倒影也会让你觉得是正的了。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何将关系（relation）转换为标量（scalar）<br />
在前文中，我们要统计符合某些条件的数据的条数，使用了COUNT函数来计算，但在COUNT之后，我们得到的还是一个关系（relation），而不是一个标量的数字，如何把一个关系转换为标量，从而可以在后续处理中便于使用呢？<br />
具体请看<a href="http://pig.apache.org/docs/r0.8.1/piglatin_ref2.html#Casting+Relations+to+Scalars" rel="noopener noreferrer" target="_blank">这个链接</a>。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;pig中如何使用shell进行辅助数据处理<br />
pig中可以嵌套使用shell进行辅助处理，下面，就以一个实际的例子来说明。<br />
假设我们在某一步pig处理后，得到了类似于下面 b.txt 中的数据：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;b.txt&nbsp;
1&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;98&nbsp;&nbsp;=&nbsp;&nbsp;&nbsp;7
34&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;2
62&nbsp;&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;=&nbsp;&nbsp;&nbsp;65
</code></pre>
</section>
<p>问题：如何将数据中第4列中的&ldquo;=&rdquo;符号全部替换为9999？<br />
pig代码及输出结果如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;b.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:chararray,&nbsp;col5:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;STREAM&nbsp;A&nbsp;THROUGH&nbsp;`awk&nbsp;&#39;{if($4&nbsp;==&nbsp;&quot;=&quot;)&nbsp;print&nbsp;$1&quot;\t&quot;$2&quot;\t&quot;$3&quot;\t9999\t&quot;$5;&nbsp;else&nbsp;print&nbsp;$0}&#39;`;
grunt&gt;&nbsp;DUMP&nbsp;B;
(1,5,98,9999,7)
(34,8,6,3,2)
(62,0,6,9999,65)
</code></pre>
</section>
<p>我们来看看这段代码是如何做到的：<br />
<strong><span style="color:#ff0000;">①</span></strong>加载数据，这个没什么好说的。<br />
<strong><span style="color:#ff0000;">②</span></strong>通过&ldquo;STREAM &hellip; THROUGH &hellip;&rdquo;的方式，我们可以调用一个shell语句，用该shell语句对A的每一行数据进行处理。此处的shell逻辑为：当某一行数据的第4列为&ldquo;=&rdquo;符号时，将其替换为&ldquo;9999&rdquo;；否则就照原样输出这一行。<br />
<strong><span style="color:#ff0000;">③</span></strong>输出B，可见结果正确。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;向pig脚本中传入参数<br />
假设你的pig脚本输出的文件是通过外部参数指定的，则此参数不能写死，需要传入。在pig中，使用传入的参数如下所示：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">STORE&nbsp;A&nbsp;INTO&nbsp;&#39;$output_dir&#39;;
</code></pre>
</section>
<p>则这个&ldquo;output_dir&rdquo;就是个传入的参数。在调用这个pig脚本的shell脚本中，我们可以这样传入参数：</p>
<blockquote>
<p>
		pig -param output_dir=&quot;/home/my_ourput_dir/&quot; my_pig_script.pig</p>
</blockquote>
<p>这里传入的参数&ldquo;output_dir&rdquo;的值为&ldquo;/home/my_output_dir/&rdquo;。<br />
<span style="color: rgb(255, 255, 255); font-family: arial, helvetica, sans-serif; font-size: 14px; line-height: 20px; text-align: left; background-color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;就算是同样一段pig代码，多次计算所得的结果也有可能是不同的<br />
例如用<span style="color:#ff0000;">AVG</span>函数来计算平均值时，同样一段pig代码，多次计算所得的结果中，小数点的最后几位也有可能是不相同的（当然也有可能相同），大概是因为精度的原因吧。不过，一般来说小数点的最后几位已经不重要了。例如我对一个数据集进行处理后，小数点后13位才开始有不同，这样的精度完全足够了。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何编写及使用自定义函数（UDF）<br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color:#0000ff;">Apache Pig中文教程（进阶）</span></a>》</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;什么是聚合函数（Aggregate Function）<br />
在pig中，聚合函数就是那些接受一个输入包（bag），返回一个标量（scalar）值的函数。COUNT函数就是一个例子。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;COGROUP做了什么<br />
与GROUP操作符一样，<span style="color:#ff0000;"><span style="background-color:#e6e6fa;">CO</span>GROUP</span>也是用来分组的，不同的是，COGROUP可以按多个关系中的字段进行分组。<br />
还是以一个实例来说明，假设有以下两个数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;a.txt&nbsp;
uidk&nbsp;&nbsp;&nbsp;&nbsp;12&nbsp;&nbsp;3
hfd&nbsp;&nbsp;&nbsp;&nbsp;132&nbsp;99
bbN&nbsp;&nbsp;&nbsp;&nbsp;463&nbsp;231
UFD&nbsp;&nbsp;&nbsp;&nbsp;13&nbsp;&nbsp;10

[root@localhost&nbsp;pig]$&nbsp;cat&nbsp;b.txt&nbsp;
908&nbsp;&nbsp;&nbsp;&nbsp;uidk&nbsp;&nbsp;&nbsp;&nbsp;888
345&nbsp;&nbsp;&nbsp;&nbsp;hfd&nbsp;557
28790&nbsp;&nbsp;&nbsp;&nbsp;re&nbsp;&nbsp;00000
</code></pre>
</section>
<p>现在我们用pig做如下操作及得到的结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(acol1:chararray,&nbsp;acol2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;acol3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;b.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(bcol1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;bcol2:chararray,&nbsp;bcol3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;C&nbsp;=&nbsp;COGROUP&nbsp;A&nbsp;BY&nbsp;acol1,&nbsp;B&nbsp;BY&nbsp;bcol2;
grunt&gt;&nbsp;DUMP&nbsp;C;
(re,{},{(28790,re,0)})
(UFD,{(UFD,13,10)},{})
(bbN,{(bbN,463,231)},{})
(hfd,{(hfd,132,99)},{(345,hfd,557)})
(uidk,{(uidk,12,3)},{(908,uidk,888)})
</code></pre>
</section>
<p>每一行输出的第一项都是分组的key，第二项和第三项分别都是一个包（bag），其中，第二项是根据前面的key找到的A中的数据包，第三项是根据前面的key找到的B中的数据包。<br />
来看看第一行输出：&ldquo;re&rdquo;作为group的key时，其找不到对应的A中的数据，因此第二项就是一个空的包&ldquo;{}&rdquo;，&ldquo;re&rdquo;这个key在B中找到了对应的数据（28790 &nbsp; &nbsp;re &nbsp; &nbsp;00000），因此第三项就是bag {(28790,re,0)}。<br />
其他输出数据也类似。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;安装pig后，运行pig命令时提示&ldquo;Cannot find hadoop configurations in classpath&rdquo;等错误的解决办法<br />
pig安装好后，运行pig命令时提示以下错误：</p>
<blockquote>
<p>
		ERROR org.apache.pig.Main - ERROR 4010: Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath).If you plan to use local mode, please put -x local option in command line</p>
</blockquote>
<p>显而易见，提示找不到与hadoop相关的配置文件。所以我们需要把hadoop安装目录下的&ldquo;conf&rdquo;子目录添加到系统环境变量PATH中：<br />
修改 /etc/profile 文件，添加：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="bash language-bash hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">export</span>&nbsp;HADOOP_HOME=/usr/<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">local</span>/hadoop
<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">export</span>&nbsp;PIG_CLASSPATH=<span class="hljs-variable" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(98, 151, 85); word-wrap: inherit !important; word-break: inherit !important;">$HADOOP_HOME</span>/conf

PATH=<span class="hljs-variable" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(98, 151, 85); word-wrap: inherit !important; word-break: inherit !important;">$JAVA_HOME</span>/bin:<span class="hljs-variable" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(98, 151, 85); word-wrap: inherit !important; word-break: inherit !important;">$HADOOP_HOME</span>/bin:<span class="hljs-variable" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(98, 151, 85); word-wrap: inherit !important; word-break: inherit !important;">$PIG_CLASSPATH</span>:<span class="hljs-variable" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(98, 151, 85); word-wrap: inherit !important; word-break: inherit !important;">$PATH</span></code></pre>
</section>
<p>然后重新加载 /etc/profile 文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="bash language-bash hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">source</span>&nbsp;/etc/profile
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color:#ffffff;">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;piggybank是什么东西</p>
<blockquote>
<p>
		Pig also hosts a UDF repository called piggybank that allows users to share UDFs that they have written.</p>
</blockquote>
<p>说白了就是Apache把大家写的自定义函数放在一块儿，起了个名字，就叫做piggybank。你可以把它理解为一个SVN代码仓库。具体请看<a href="https://cwiki.apache.org/confluence/display/PIG/PiggyBank" rel="noopener noreferrer" target="_blank"><span style="color:#ff0000;">这里</span></a>。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;UDF的构造函数会被调用几次<br />
你可能会想在UDF的构造函数中做一些初始化的工作，例如创建一些文件，等等。但是你不能假设UDF的构造函数只被调用一次，因此，如果你要在构造函数中做一些只能做一次的工作，你就要当心了&mdash;&mdash;可能会导致错误。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;LOAD数据时，如何一次LOAD多个目录下的数据<br />
例如，我要LOAD两个HDFS目录下的数据：/abc/2010 和 /abc/2011，则我们可以这样写LOAD语句：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;/abc/201{0,1}&#39;</span>;
</code></pre>
</section>
<p>
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;怎样自己写一个UDF中的加载函数(load function)<br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color: rgb(0, 0, 255); ">Apache Pig中文教程（进阶）</span></a>》</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;重载(overloading)一个UDF<br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color: rgb(0, 0, 255); ">Apache Pig中文教程（进阶）</span></a>》。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;pig运行不起来，提示&ldquo;<span style="color:#b22222;">org.apache.hadoop.ipc.Client - Retrying connect to server:&nbsp;</span><br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color: rgb(0, 0, 255); ">Apache Pig中文教程（进阶）</span></a>》</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;用含有null的字段来GROUP，结果会如何<br />
假设有数据文件 a.txt 内容为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;2&nbsp;&nbsp;&nbsp;5
1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3
1&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;
6&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;8
</code></pre>
</section>
<p>其中，每两列数据之间是用tab分割的，第二行的第2列、第三行的第3列没有内容（也就是说，加载到Pig里之后，对应的数据会变成null），如果把这些数据按第1、第2列来GROUP的话，第1、2列中含有null的行会被忽略吗？<br />
来做一下试验：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col1,&nbsp;col2);
DUMP&nbsp;B;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">((1,2),{(1,2,5)})
((1,3),{(1,3,)})
((1,),{(1,,3)})
((6,9),{(6,9,8)})
</code></pre>
</section>
<p>从上面的结果（第三行）可见，原数据中第1、2列里含有null的行也被计入在内了，也就是说，GROUP操作是不会忽略null的，这与COUNT有所不同（见本文前面的部分）。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何统计数据中某些字段的组合有多少种<br />
假设有如下数据：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;a.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;7
1&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;4
2&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;0&nbsp;&nbsp;&nbsp;5
9&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;6
</code></pre>
</section>
<p>现在我们要统计第1、2列的不同组合有多少种，对本例来说，组合有三种：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;3
2&nbsp;&nbsp;&nbsp;&nbsp;7
9&nbsp;&nbsp;&nbsp;&nbsp;8
</code></pre>
</section>
<p>也就是说我们要的答案是3。<br />
用Pig怎么计算？<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
先写出全部的Pig代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col4:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col1,&nbsp;col2);&nbsp;
C&nbsp;=&nbsp;GROUP&nbsp;B&nbsp;ALL;
D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;COUNT(B);&nbsp;
DUMP&nbsp;D;
</code></pre>
</section>
<p>然后再来看看这些代码是如何计算出上面的结果的：<br />
<strong><span style="color:#ff0000;">①</span></strong>第一行代码加载数据，没什么好说的。<br />
<strong><span style="color:#ff0000;">②</span></strong>第二行代码，得到第1、2列数据的所有组合。B的数据结构为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;B;
B:&nbsp;{group:&nbsp;(col1:&nbsp;int,col2:&nbsp;int),A:&nbsp;{col1:&nbsp;int,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int}}
</code></pre>
</section>
<p>把B DUMP出来，得到：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">((1,3),{(1,3,4,7),(1,3,5,4)})
((2,7),{(2,7,0,5)})
((9,8),{(9,8,6,6)})
</code></pre>
</section>
<p>非常明显，(1,3)，(2,7)，(9,8)的所有组合已经被排列出来了，这里得到了若干行数据。下一步我们要做的就是统计这样的数据一共有多少行，也就得到了第1、2列的组合有多少组。<br />
<strong><span style="color:#ff0000;">③</span></strong>第三和第四行代码，就实现了统计数据行数的功能。参考本文前面部分的&ldquo;怎样统计数据行数&rdquo;一节。就明白这两句代码是什么意思了。<br />
这里需要特别说明的是：<br />
<strong><span style="color:#0000ff;">a)</span></strong>为什么倒数第二句代码中是COUNT(B)，而不是COUNT(group)？<br />
我们是对C进行FOREACH，所以要先看看C的数据结构：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;C;
C:&nbsp;{group:&nbsp;chararray,B:&nbsp;{group:&nbsp;(col1:&nbsp;int,col2:&nbsp;int),A:&nbsp;{col1:&nbsp;int,col2:&nbsp;int,col3:&nbsp;int,col4:&nbsp;int}}}
</code></pre>
</section>
<p>可见，你可以把C想像成一个map的结构，key是一个group，value是一个包（bag），它的名字是B，这个包中有N个元素，每一个元素都对应到②中所说的一行。根据②的分析，我们就是要统计B中元素的个数，因此，这里当然就是COUNT(B)了。<br />
<strong><span style="color:#0000ff;">b)</span></strong>COUNT函数的作用是统计一个包（bag）中的元素的个数：</p>
<blockquote>
<div>
		COUNT</div>
<div>
		Computes the number of elements in a bag.</div>
</blockquote>
<div>
	从C的数据结构看，B是一个bag，所以COUNT函数是可以用于它的。<br />
	如果你试图把COUNT应用于一个非bag的数据结构上，会发生错误，例如：、</p>
<div>
<blockquote>
<div>
				java.lang.ClassCastException: org.apache.pig.data.BinSedesTuple cannot be cast to org.apache.pig.data.DataBag</div>
</blockquote></div>
</div>
<p>这是把Tuple传给COUNT函数时发生的错误。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;两个整型数相除，如何转换为浮点型，从而得到正确的结果<br />
这个问题其实很傻，或许不用说你也知道了：假设有int a = 3 和 int b = 2两个数，在大多数编程语言里，a/b得到的是1，想得到正确结果1.5的话，需要转换为float再计算。在Pig中其实和这种情况一样，下面就拿几行数据来做个实验：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;a.txt&nbsp;</span>
3&nbsp;&nbsp;&nbsp;&nbsp;2
4&nbsp;&nbsp;&nbsp;&nbsp;5
</code></pre>
</section>
<p>在Pig中：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
grunt&gt;&nbsp;B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;col1/col2;
grunt&gt;&nbsp;DUMP&nbsp;B;
(1)
(0)
</code></pre>
</section>
<p>可见，不加类型转换的计算结果是取整之后的值。<br />
那么，转换一下试试：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;(float)(col1/col2);
grunt&gt;&nbsp;DUMP&nbsp;B;
(1.0)
(0.0)
</code></pre>
</section>
<p>这样转换还是不行的，这与大多数编程语言的结果一致&mdash;&mdash;它只是把取整之后的数再转换为浮点数，因此当然是不行的。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
正确的做法应该是：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;
grunt&gt;&nbsp;B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;(float)col1/col2;&nbsp;&nbsp;
grunt&gt;&nbsp;DUMP&nbsp;B;
(1.5)
(0.8)
</code></pre>
</section>
<p>或者这样也行：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
grunt&gt;&nbsp;B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;col1/(float)col2;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
grunt&gt;&nbsp;DUMP&nbsp;B;
(1.5)
(0.8)
</code></pre>
</section>
<p>这与我们的通常做法是一致的，因此，你要做除法运算的时候，需要注意这一点。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;UNION的一个例子<br />
假设有两个数据文件为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
0&nbsp;&nbsp;&nbsp;&nbsp;3
1&nbsp;&nbsp;&nbsp;&nbsp;5
0&nbsp;&nbsp;&nbsp;&nbsp;8

[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;2.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;6
0&nbsp;&nbsp;&nbsp;&nbsp;9
</code></pre>
</section>
<p>现在要求出：在第一列相同的情况下，第二列的和分别为多少？<br />
例如，第一列为 1 的时候，第二列有5和6两个值，和为11。同理，第一列为0的时候，第二列的和为 3+8+9=20。<br />
计算此问题的Pig代码如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(a:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;b:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(c:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;d:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;
C&nbsp;=&nbsp;UNION&nbsp;A,&nbsp;B;
D&nbsp;=&nbsp;GROUP&nbsp;C&nbsp;BY&nbsp;$0;&nbsp;
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;FLATTEN(group),&nbsp;SUM(C.$1);
DUMP&nbsp;E;
</code></pre>
</section>
<p>输出为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(0,20)
(1,11)
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
我们来看看每一步分别做了什么：<br />
<strong><span style="color:#ff0000;">①</span></strong>第1行、第2行代码分别加载数据到关系A、B中，没什么好说的。<br />
<strong><span style="color:#ff0000;">②</span></strong>第3行代码，将关系A、B合并起来了。合并后的数据结构为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;C;
C:&nbsp;{a:&nbsp;int,b:&nbsp;int}
</code></pre>
</section>
<p>其数据为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;DUMP&nbsp;C;
(0,3)
(1,5)
(0,8)
(1,6)
(0,9)
</code></pre>
</section>
<p><strong><span style="color:#ff0000;">③</span></strong>第4行代码按第1列（即$0）进行分组，分组后的数据结构为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;D;
D:&nbsp;{group:&nbsp;int,C:&nbsp;{a:&nbsp;int,b:&nbsp;int}}
</code></pre>
</section>
<p>其数据为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;DUMP&nbsp;D;
(0,{(0,9),(0,3),(0,8)})
(1,{(1,5),(1,6)})
</code></pre>
</section>
<p><strong><span style="color:#ff0000;">④</span></strong>最后一行代码，遍历D，将D中每一行里的所有bag(即C)的第2列(即$1)进行累加，就得到了我们要的结果。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;错误&ldquo;<span style="color:#0000ff;">ERROR org.apache.pig.tools.grunt.Grunt - ERROR 2042: Error in new logical plan. Try -Dpig.usenewlogicalplan=false.</span>&rdquo;的可能原因<br />
<strong><span style="color:#ff0000;">①</span></strong>Pig的bug，详见<a href="https://issues.apache.org/jira/browse/PIG-1683" rel="noopener noreferrer" target="_blank">此链接</a>；<br />
<strong><span style="color:#ff0000;">②</span></strong>其他原因。我遇到并解决了一例。具体的代码不便在此陈列，但是基本可以说是由于自己写的Pig代码对复杂数据结构的处理不当导致的，后来我尝试更改了一种实现方式，就绕过了这个问题。关于这点，确实还是要具体问题具体分析的，在这里没有实例的话，无法给大家一个明确的解决问题的指南。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何在Pig中使用正则表达式对字符串进行匹配<br />
假设你有如下数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;a.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;http://ui.qq.com/abcd.html
5&nbsp;&nbsp;&nbsp;&nbsp;http://tr.qq.com/743.html
8&nbsp;&nbsp;&nbsp;&nbsp;http://vid.163.com/trees.php
9&nbsp;&nbsp;&nbsp;&nbsp;http:auto.qq.com/us.php
</code></pre>
</section>
<p>现在要找出该文件中，第二列符合&ldquo;<span style="color:#ff0000;">*//*.qq.com/*</span>&rdquo;模式的所有行（此处只有前两行符合条件），怎么做？<br />
Pig代码如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;col2&nbsp;matches&nbsp;&#39;.*//.*\\.qq\\.com/.*&#39;;&nbsp;&nbsp;
DUMP&nbsp;B;
</code></pre>
</section>
<p>我们看到，matches关键字对 col2 进行了正则匹配，它使用的是Java格式的正则表达式匹配规则。<br />
<span style="color:#ff0000;">.&nbsp;</span>表示任意字符，<strong><span style="color:#ff0000;">*&nbsp;</span></strong>表示字符出现任意次数；<strong><span style="color:#ff0000;">\.&nbsp;</span></strong>对&nbsp;<span style="color:#ff0000;">.&nbsp;</span>进行了转义，表示匹配&nbsp;<span style="color:#ff0000;">.&nbsp;</span>这个字符；<span style="color:#ff0000;">/&nbsp;</span>就是表示匹配&nbsp;<strong><span style="color:#ff0000;">/&nbsp;</span></strong>这个字符。<br />
这里需要注意的是，在引号中，用于转义的字符&nbsp;<strong><span style="color:#ff0000;">\&nbsp;</span></strong>需要打两个才能表示一个，所以上面的&nbsp;<strong><span style="color:#ff0000;">\\.&nbsp;</span></strong>就是与正则中的&nbsp;<strong><span style="color:#ff0000;">\.&nbsp;</span></strong>是一样的，即匹配<strong>&nbsp;</strong><span style="color:#ff0000;">.&nbsp;</span>这个字符。所以，如果你要匹配数字的话，应该用这种写法（<strong><span style="color:#ff0000;">\d</span></strong>表示匹配数字，在引号中必须用<strong><span style="color:#ff0000;">\\d</span></strong>）：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;(col&nbsp;matches&nbsp;&#39;\\d.*&#39;);
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
最后输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,http://ui.qq.com/abcd.html)
(5,http://tr.qq.com/743.html)
</code></pre>
</section>
<p>可见结果是正确的。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何截取一个字符串中的某一段<br />
在处理数据时，如果你想提取出一个日期字符串的年份，例如提取出&ldquo;2011-10-26&rdquo;中的&ldquo;2011&rdquo;，可以用内置函数 <span style="color:#0000ff;">SUBSTRING</span> 来实现：</p>
<blockquote>
<div>
		<span style="font-size:20px;">SUBSTRING</span></div>
<div>
		Returns a substring from a given string.</div>
<div>
		<span style="font-size:18px;">Syntax</span></div>
<div>
		SUBSTRING(string, startIndex, stopIndex)</div>
</blockquote>
<p>下面举一个例子。假设有数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;a.txt&nbsp;</span>
2010-05-06&nbsp;&nbsp;&nbsp;&nbsp;abc
2008-06-18&nbsp;&nbsp;&nbsp;&nbsp;uio
2011-10-11&nbsp;&nbsp;&nbsp;&nbsp;tyr
2010-12-23&nbsp;&nbsp;&nbsp;&nbsp;fgh
2011-01-05&nbsp;&nbsp;&nbsp;&nbsp;vbn
</code></pre>
</section>
<p>第一列是日期，现在要找出所有不重复的年份有哪些，可以这样做：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(dateStr:&nbsp;chararray,&nbsp;flag:&nbsp;chararray);
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;SUBSTRING(dateStr,&nbsp;0,&nbsp;4);
C&nbsp;=&nbsp;DISTINCT&nbsp;B;
DUMP&nbsp;C;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(2008)
(2010)
(2011)
</code></pre>
</section>
<p>可见达到了我们想要的效果。<br />
上面的代码太简单了，不必多言，唯一需要说明一下的是 SUBSTRING 函数，它的第一个参数是要截取的字符串，第二个参数是起始索引（从0开始），第三个参数是结束索引。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何拼接两个字符串<br />
假设有以下数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
abc&nbsp;&nbsp;&nbsp;&nbsp;123
cde&nbsp;&nbsp;&nbsp;&nbsp;456
fgh&nbsp;&nbsp;&nbsp;&nbsp;789
ijk&nbsp;&nbsp;&nbsp;&nbsp;200
</code></pre>
</section>
<p>现在要把第一列和第二列作为字符串拼接起来，例如第一行会变成&ldquo;abc123&rdquo;，那么使用CONCAT这个求值函数（eval function）就可以做到：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;CONCAT(col1,&nbsp;(chararray)col2);
DUMP&nbsp;B;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(abc123)
(cde456)
(fgh789)
(ijk200)
</code></pre>
</section>
<p>注意这里故意在加载数据的时候把第二列指定为int类型，这是为了说明数据类型不一致的时候CONCAT会出错（你可以试验一下）：</p>
<blockquote>
<p>
		ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1045: Could not infer the matching function for org.apache.pig.builtin.CONCAT as multiple or none of them fit. Please use an explicit cast.</p>
</blockquote>
<p>所以在后面CONCAT的时候，对第二列进行了类型转换。<br />
另外，如果数据文件内容为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
5&nbsp;&nbsp;&nbsp;&nbsp;123
7&nbsp;&nbsp;&nbsp;&nbsp;456
8&nbsp;&nbsp;&nbsp;&nbsp;789
0&nbsp;&nbsp;&nbsp;&nbsp;200
</code></pre>
</section>
<p>那么，如果对两列整数CONCAT：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;CONCAT(col1,&nbsp;col2);
</code></pre>
</section>
<p><span style="color:#ff0000;">同样也会出错</span>：</p>
<blockquote>
<p>
		ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1045: Could not infer the matching function for org.apache.pig.builtin.CONCAT as multiple or none of them fit. Please use an explicit cast.</p>
</blockquote>
<p>要注意这一点。<br />
有人可能会问：要拼接几个字符串的话怎么办？CONCAT 套 CONCAT 就要可以了（有点笨，但管用）： <span style="color:#0000ff;">CONCAT(a, CONCAT(b, c))</span></p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何求两个数据集的重合 &amp; 不同的数据类型JOIN会失败<br />
假设有以下两个数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
123
456
789
200
</code></pre>
</section>
<p>以及：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;2.txt&nbsp;</span>
200
333
789
</code></pre>
</section>
<p>现在要找出两个文件中，相同的数据有多少行，怎么做？这也就是所谓的求两个数据集的<span style="color:#ff0000;">重合</span>。<br />
用关系操作符JOIN，我们可以达到这个目的。在处理海量数据时，经常会有求重合的需求。所以JOIN是Pig中一个极其重要的操作。<br />
在本例中，两个文件中有两个相同的数据行：789以及200，因此，结果应该是2。<br />
我们先来看看正确的代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(a:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;&nbsp;&nbsp;
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(b:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;a,&nbsp;B&nbsp;BY&nbsp;b;
D&nbsp;=&nbsp;GROUP&nbsp;C&nbsp;ALL;
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;COUNT(C);
DUMP&nbsp;E;
</code></pre>
</section>
<p>解释一下：<br />
<span style="color:#ff0000;">①</span>第一、二行是加载数据，不必多言。<br />
<span style="color:#ff0000;">②</span>第三行按A的第1列、B的第二列进行&ldquo;结合&rdquo;，JOIN之后，a、b两列不相同的数据就被剔除掉了。C的数据结构为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">C:&nbsp;{A::a:&nbsp;int,B::b:&nbsp;int}
</code></pre>
</section>
<p>C的数据为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(200,200)
(789,789)
</code></pre>
</section>
<p><span style="color:#ff0000;">③</span>由于我们要统计的是数据行数，所以上面的Pig代码中的第4、5行就进行了计数的运算。<br />
<span style="color:#ff0000;">④</span>如果文件 2.txt 多一行数据&ldquo;200&rdquo;，结果会是什么？答案是：结果为3行。这个时候C的数据为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(200,200)
(200,200)
(789,789)
</code></pre>
</section>
<p>所以如果你要去除重复的，还需要用DISTINCE对C处理一下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(a:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(b:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;a,&nbsp;B&nbsp;BY&nbsp;b;
uniq_C&nbsp;=&nbsp;DISTINCT&nbsp;C;
D&nbsp;=&nbsp;GROUP&nbsp;uniq_C&nbsp;ALL;
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;COUNT(uniq_C);
DUMP&nbsp;E;
</code></pre>
</section>
<p>这样得到的结果就是2了。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
尤其需要注意的是，如果JOIN的两列具有不同的数据类型，是会失败的。例如以下代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(a:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);&nbsp;&nbsp;&nbsp;
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(b:&nbsp;chararray);
C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;a,&nbsp;B&nbsp;BY&nbsp;b;
D&nbsp;=&nbsp;GROUP&nbsp;C&nbsp;ALL;
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;COUNT(C);
DUMP&nbsp;E;
</code></pre>
</section>
<div>
	在语法上是没有错误的，但是一运行就会报错：</div>
<blockquote>
<div>
		ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1107: Cannot merge join keys, incompatible types</div>
</blockquote>
<div>
	这是因为a、b具有不同的类型：int和chararray。</p>
<p>	<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;使用三目运算符&ldquo; <span style="color:#ff0000;">? :</span> &rdquo;有时候必须加括号<br />
	假设有以下数据文件：</div>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;a.txt&nbsp;</span>
5&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;9
6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;0
4&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;1
</code></pre>
</section>
<p>其中，第二行的第二列数据是有缺失的，因此，加载数据之后，它会成为null。顺便废话一句，在处理海量数据时，数据有缺失是经常遇到的现象。<br />
现在，我们如果要把所有缺失的数据填为 -1， 可以使用三目运算符来操作：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col3:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;col1,&nbsp;((col2&nbsp;is&nbsp;null)?&nbsp;-1&nbsp;:&nbsp;col2),&nbsp;col3;
DUMP&nbsp;B;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(5,8,9)
(6,-1,0)
(4,3,1)
</code></pre>
</section>
<p><span style="color:#0000ff;">((col2 is null)? -1 : col2)</span> 的含义不用解释你也知道，就是当col2为null的时候将其置为-1，否则就保持原来的值，但是注意，它最外面是用括号括起来的，如果去掉括号，写成&nbsp;<span style="color:#0000ff;">(col2 is null)? -1 : col2</span>，那么就会有语法错误：</p>
<blockquote>
<div>
		ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1000: Error during parsing. Encountered &quot; &quot;is&quot; &quot;is &quot;&quot; at line 1, column 36.</div>
<div>
		Was expecting one of （后面省略）</div>
</blockquote>
<div>
	错误提示有点不直观。所以，有时候使用三目运算符是必须要使用括号的。</p>
<p>	<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何补上缺失的数据<br />
	通过前面的文章，我们已经知道了如何按自己的需求补上缺失的数据，那么这里还有一个例子，可以让你多了解一些特殊的情况。<br />
	数据文件如下：</div>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;(4,9)
5&nbsp;&nbsp;&nbsp;&nbsp;
8&nbsp;&nbsp;&nbsp;&nbsp;(3,0)
5&nbsp;&nbsp;&nbsp;&nbsp;(9,2)
6&nbsp;&nbsp;&nbsp;&nbsp;
</code></pre>
</section>
<p>这些数据的布局比较怪，我们要把它加载成什么样的schema呢？答：第一列为一个int，第二列为一个tuple，此tuple又含两个int。加载成这样的模式不是为了制造复杂度，而是为了说明后面的问题而设计的。<br />
同时，我们也注意到，第二列数据是有缺失的。<br />
问题：怎样求在第一列数据相同的情况下，第二列数据中的第一个整数的和分别为多少？<br />
例如，第一列为1的数据只有一行（即第一行），因此，第二列的第一个整数的和就是4。<br />
但是对最后一行，也就是第一列为6时，由于其第二列数据缺失，我们希望它输出的结果是0。<br />
先来看看Pig代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(a:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;b:tuple(x:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;y:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>));
B&nbsp;=&nbsp;FOREACH&nbsp;A&nbsp;GENERATE&nbsp;a,&nbsp;FLATTEN(b);
C&nbsp;=&nbsp;GROUP&nbsp;B&nbsp;BY&nbsp;a;
D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;group,&nbsp;SUM(B.x);
DUMP&nbsp;D;
</code></pre>
</section>
<p>结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,4)
(5,9)
(6,)
(8,3)
</code></pre>
</section>
<p>我们注意到，(5,9) 这一行是由数据文件 1.txt 的第 2、4行计算得到的，其中，第2行数据有缺失，但这并不影响求和计算，因为另一行数据没有缺失。你可以这样想：一个包（bag）中有多个数，当其中一个为null，而其他不为null时，把它们相加会自动忽略null。<br />
然而，第三行 (6,) 是不是太刺眼了？没错，因为数据文件 1.txt 的最后一行缺失了第二列，所以，在 SUM(B.x) 中的 B.x 为null就会导致计算结果为null，从而什么也输出不了。<br />
这就与我们期望的输出有点不同了。我们希望这种缺失的数据不要空着，而是输出0。该怎么做呢？<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="color:#ff0000;">想法1</span>：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;group,&nbsp;((IsEmpty(B.x))&nbsp;?&nbsp;0&nbsp;:&nbsp;SUM(B.x));
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,4)
(5,9)
(6,)
(8,3)
</code></pre>
</section>
<p>可见行不通。从这个结果我们知道，IsEmpty(B.x) 为false，即B.x不是empty的，所以不能这样做。<br />
<span style="color:#ff0000;">想法2</span>：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;group,&nbsp;((B.x&nbsp;is&nbsp;null)&nbsp;?&nbsp;0&nbsp;:&nbsp;SUM(B.x));
</code></pre>
</section>
<p>输出结果还是与上面一样！仍然行不通。这更奇怪了：B.x既非empty，也非null，那么它是什么情况？按照我的理解，当group为6时，它应该是一个非空的包（bag），里面有一个null的东西，所以，这个包不是empty的，它也非null。我不知道这样理解是否正确，但是它看上去就像是这样的。<br />
<span style="color:#ff0000;">想法3</span>：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;group,&nbsp;SUM(B.x)&nbsp;AS&nbsp;s;
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;group,&nbsp;((s&nbsp;is&nbsp;null)&nbsp;?&nbsp;-1&nbsp;:&nbsp;s);
DUMP&nbsp;E;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,4)
(5,9)
(6,-1)
(8,3)
</code></pre>
</section>
<p>可见达到了我们想要的结果。这与本文前面部分的做法是一致的，即：先得到含null的结果，再把这个结果中的null替换为指定的值。<br />
有人会问：就没有办法在生成数据集D的时候，就直接通过判断语句来实现这个效果吗？据我目前所知是不行的，如果哪位读者知道，不妨告知。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;DISTINCT操作用于去重，正因为它要把数据集合到一起，才知道哪些数据是重复的，因此，它会产生reduce过程。同时，在map阶段，它也会利用combiner来先去除一部分重复数据以加快处理速度。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何将Pig job的优先级设为HIGH<br />
嫌Pig job运行太慢？只需在Pig脚本的开头加上一句：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;job.priority&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">HIGH</span>;
</code></pre>
</section>
<p>即可将Pig job的优先级设为高了。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;&ldquo;<span style="color:#ff0000;">Scalars can be only used with projections</span>&rdquo;错误的原因<br />
这个错误提示比较不直观，光看这句话是不容易发现错误所在的，但是，只要你一Google，可能就找到原因了，例如<a href="https://issues.apache.org/jira/browse/PIG-1788" rel="noopener noreferrer" target="_blank">这个链接</a>里的反馈。<br />
在这里，我也想用一个简单的例子给大家用演示一下产生这个错误的原因之一。<br />
假设有如下数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]$&nbsp;cat&nbsp;1.txt&nbsp;
a&nbsp;&nbsp;&nbsp;&nbsp;1
b&nbsp;&nbsp;&nbsp;&nbsp;8
c&nbsp;&nbsp;&nbsp;&nbsp;3
c&nbsp;&nbsp;&nbsp;&nbsp;3
d&nbsp;&nbsp;&nbsp;&nbsp;6
d&nbsp;&nbsp;&nbsp;&nbsp;3
c&nbsp;&nbsp;&nbsp;&nbsp;5
e&nbsp;&nbsp;&nbsp;&nbsp;7
</code></pre>
</section>
<p>现在要统计：在第1列的每一种组合下，第二列为3和6的数据分别有多少条？<br />
例如，当第1列为 c 时，第二列为3的数据有2条，为6的数据有0条；当第1列为d时，第二列为3的数据有1条，为6的数据有1条。其他的依此类推。<br />
Pig代码如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:chararray,&nbsp;col2:<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;col1;
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;{
&nbsp;&nbsp;&nbsp;&nbsp;D&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;col2&nbsp;==&nbsp;3;
&nbsp;&nbsp;&nbsp;&nbsp;E&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;col2&nbsp;==&nbsp;6;
&nbsp;&nbsp;&nbsp;&nbsp;GENERATE&nbsp;group,&nbsp;COUNT(D),&nbsp;COUNT(E);
};
DUMP&nbsp;C;
</code></pre>
</section>
<p>输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(a,0,0)
(b,0,0)
(c,2,0)
(d,1,1)
(e,0,0)
</code></pre>
</section>
<p>可见结果是正确的。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
那么，如果我在上面的代码中，把&ldquo;D = FILTER <span style="color:#ff0000;">A</span> BY col2 == 3&rdquo;不小心写成了&ldquo;D = FILTER <span style="color:#ff0000;">B</span> BY col2 == 3&rdquo;，就肯定会得到&ldquo;<span style="color:#0000ff;">Scalars can be only used with projections</span>&rdquo;的错误提示。<br />
说白了，还是要时刻注意你每一步生成的数据的结构，眼睛睁大，千万不要用错了relation。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;什么是嵌套的FOREACH/内部的FOREACH<br />
嵌套的（nested）FOREACH和内部的（inner）FOREACH是一个意思，正如你在本文第(35)条中所见，一个FOREACH可以对每一条记录施以多种不同的关系操作，然后再GENERATE得到想要的结果，这就是嵌套的/内部的FOREACH。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;错误&ldquo;Could not infer the matching function for org.apache.pig.builtin.CONCAT&rdquo;的原因之一<br />
如果你遇到这个错误，那么有可能是你在多级CONCAT嵌套的时候，没有写对语句，例如&ldquo;CONCAT(CONCAT(CONCAT(a, b), c), d)&rdquo;这样的嵌套，由于括号众多，所以写错了是一点也不奇怪的。我遇这个错误的时候，是由于CONCAT太多，自己多写了一个都没有发现。希望我的提醒能给你一点解决问题的提示。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;用Pig加载HBase数据时遇到的错误&ldquo;<span style="color:#0000ff;">ERROR 2999: Unexpected internal error. could not instantiate &#39;com.twitter.elephantbird.pig.load.HBaseLoader&#39; with arguments XXX</span>&rdquo;的原因之一<br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color: rgb(0, 0, 255); ">Apache Pig中文教程（进阶）</span></a>》</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;错误&ldquo;<span style="color:#0000ff;">ERROR 1039: In alias XX, incompatible types in EqualTo Operator left hand side:XXX right hand side:XXX</span>&rdquo;的原因<br />
其实这个错误提示太明显了，就是类型不匹配造成的。上面的XXX可以指代不同的类型。<br />
这说明，前面可能有一个类型为long的字段，后面你却把它当chararray来用了，例如：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">long</span>);
B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;col2&nbsp;==&nbsp;&#39;123456789&#39;;
C&nbsp;=&nbsp;GROUP&nbsp;B&nbsp;ALL;
D&nbsp;=&nbsp;FOREACH&nbsp;C&nbsp;GENERATE&nbsp;COUNT(B);
DUMP&nbsp;D;
</code></pre>
</section>
<p>就会出错：</p>
<blockquote>
<p>
		ERROR 1039: In alias B, incompatible types in EqualTo Operator left hand side:long right hand side:chararray</p>
</blockquote>
<p>只要把col2强制类型转换一下（或者一开始就将其类型指定为chararray）就可以解决问题。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
不仅在进行数据比较中，在JOIN时也经常出现数据类型不匹配导致的错误问题。我在实际工作中发现，有的同学写了比较长的Pig代码，出现了这样的错误却不会仔细去看错误提示，而是绞尽脑汁地逐句去检查语法（语法是没有错的），结果费了很大的劲才知道是类型问题，得不偿失，还不如仔细看错误提示想想为什么。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;在grunt交互模式下，如何在编辑Pig代码的时候跳到行首和行末/行尾<br />
在grunt模式下，如果你写了一句超长的Pig代码，那么，你想通过HOME/END键跳到行首和行末是做不到的。<br />
按HOME时，Pig会在你的光标处插入一个&ldquo;1~&rdquo;：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">col</span>:&nbsp;int1~);
</code></pre>
</section>
<p>按END时，Pig会在你的光标处插入一个&ldquo;4~&rdquo;：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">col</span>:&nbsp;int4~);
</code></pre>
</section>
<p>正确的做法是：按Ctrl+A 和 Ctrl+E 代替 HOME 和 END，就可以跳到行首和行末了。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;不能对同一个关系（relation）进行JOIN<br />
假设有如下文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;a
2&nbsp;&nbsp;&nbsp;&nbsp;e
3&nbsp;&nbsp;&nbsp;&nbsp;v
4&nbsp;&nbsp;&nbsp;&nbsp;n
</code></pre>
</section>
<p>我想对第一列这样JOIN：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);&nbsp;
B&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1,&nbsp;A&nbsp;BY&nbsp;col1;
</code></pre>
</section>
<p>那么当你试图 DUMP B 的时候，会报如下的错：</p>
<blockquote>
<p>
		ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1108: Duplicate schema alias: A::col1 in &quot;B&quot;</p>
</blockquote>
<p>这是因为Pig会弄不清JOIN之后的字段名&mdash;&mdash;两个字段均为A::col1，使得一个关系（relation）中出现了重复的名字，这是不允许的。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
要解决这个问题，只需将数据LOAD两次，并且给它们起不同的名字就可以了：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1,&nbsp;B&nbsp;BY&nbsp;col1;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;C;
C:&nbsp;{A::col1:&nbsp;int,A::col2:&nbsp;chararray,B::col1:&nbsp;int,B::col2:&nbsp;chararray}
grunt&gt;&nbsp;DUMP&nbsp;C;
(1,a,1,a)
(2,e,2,e)
(3,v,3,v)
(4,n,4,n)
</code></pre>
</section>
<p>从上面的 C 的schema，你可以看出来，如果对同一个关系A的第一列进行JOIN，会导致schema中出现相同的字段名，所以当然会出错。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;外部的JOIN(outer JOIN)<br />
初次使用JOIN时，一般人使用的都是所谓的&ldquo;内部的JOIN&rdquo;(inner JOIN)，也即类似于 C = JOIN A BY col1, B BY col2 这样的JOIN。Pig也支持&ldquo;外部的JOIN&rdquo;(outer JOIN)，下面就举一个例子。<br />
假设有文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;1.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;a
2&nbsp;&nbsp;&nbsp;&nbsp;e
3&nbsp;&nbsp;&nbsp;&nbsp;v
4&nbsp;&nbsp;&nbsp;&nbsp;n
</code></pre>
</section>
<p>以及：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;2.txt&nbsp;</span>
9&nbsp;&nbsp;&nbsp;&nbsp;a
2&nbsp;&nbsp;&nbsp;&nbsp;e
3&nbsp;&nbsp;&nbsp;&nbsp;v
0&nbsp;&nbsp;&nbsp;&nbsp;n
</code></pre>
</section>
<p>现在来对这两个文件的第一列作一个outer JOIN：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1&nbsp;LEFT&nbsp;OUTER,&nbsp;B&nbsp;BY&nbsp;col1;
grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;C;
C:&nbsp;{A::col1:&nbsp;int,A::col2:&nbsp;chararray,B::col1:&nbsp;int,B::col2:&nbsp;chararray}
grunt&gt;&nbsp;DUMP&nbsp;C;
(1,a,,)
(2,e,2,e)
(3,v,3,v)
(4,n,,)
</code></pre>
</section>
<p>在outer JOIN中，&ldquo;OUTER&rdquo;关键字是可以省略的。从上面的结果，我们注意到：如果换成一个inner JOIN，则两个输入文件的第一、第四行都不会出现在结果中（因为它们的第一列不相同），而在LEFT OUTER JOIN中，文件1.txt的第一、四行却被输出了，所以这就是LEFT OUTER JOIN的特点：<span style="color:#800080;">对左边的记录来说，即使<span class="KSFIND_CLASS_SELECT" id="0KSFindDIV">它与右边的记录不匹配，它也会被</span>包含在输出数据中</span>。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
同理可知RIGHT OUTER JOIN的功能&mdash;&mdash;把上面的 LEFT 换成 RIGHT，结果如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(,,0,n)
(2,e,2,e)
(3,v,3,v)
(,,9,a)
</code></pre>
</section>
<p>可见，与左边的记录不匹配的右边的记录被保存了下来，而左边的记录没有保存下来（两个逗号表明其为空），这就是RIGHT OUTER JOIN的效果，与我们想像的一样。<br />
有人会问，OUTER JOIN在实际中可以用来做什么？举一个例子：可以用来求&ldquo;<span style="color:#a52a2a;">不在某数据集中的那些数据（即：不重合的数据）</span>&rdquo;。还是以上面的两个数据文件为例，现在我要求出 1.txt 中，第一列不在 2.txt 中的第一列的那些记录，肉眼一看就知道，1和4这两个数字在 2.txt 的第一列里没有出现，而2和3出现了，因此，我们要找的记录就是：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;a
4&nbsp;&nbsp;&nbsp;&nbsp;n
</code></pre>
</section>
<p>要实现这个效果，Pig代码及结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray);
grunt&gt;&nbsp;C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1&nbsp;LEFT&nbsp;OUTER,&nbsp;B&nbsp;BY&nbsp;col1;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
grunt&gt;&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">DESCRIBE</span>&nbsp;C;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
C:&nbsp;{A::col1:&nbsp;int,A::col2:&nbsp;chararray,B::col1:&nbsp;int,B::col2:&nbsp;chararray}
grunt&gt;&nbsp;D&nbsp;=&nbsp;FILTER&nbsp;C&nbsp;BY&nbsp;(B::col1&nbsp;is&nbsp;null);&nbsp;
grunt&gt;&nbsp;E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;A::col1&nbsp;AS&nbsp;col1,&nbsp;A::col2&nbsp;AS&nbsp;col2;
grunt&gt;&nbsp;DUMP&nbsp;E;
(1,a)
(4,n)
</code></pre>
</section>
<p>可见，我们确实找出了&ldquo;不重合的记录&rdquo;。在作海量数据分析时，这种功能是极为有用的。<br />
最后来一个总结：<br />
假设有两个数据集（在1.txt和2.txt中），分别都只有1列，则如下代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray);
B&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;2.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray);&nbsp;&nbsp;
C&nbsp;=&nbsp;JOIN&nbsp;A&nbsp;BY&nbsp;col1&nbsp;LEFT&nbsp;OUTER,&nbsp;B&nbsp;BY&nbsp;col1;
D&nbsp;=&nbsp;FILTER&nbsp;C&nbsp;BY&nbsp;(B::col1&nbsp;is&nbsp;null);
E&nbsp;=&nbsp;FOREACH&nbsp;D&nbsp;GENERATE&nbsp;A::col1&nbsp;AS&nbsp;col1;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
DUMP&nbsp;E;
</code></pre>
</section>
<p>计算结果为：在A中，但不在B中的记录。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;JOIN的优化<br />
请看这个链接：《<a href="http://www.codelast.com/?p=4249" rel="noopener noreferrer" target="_blank"><span style="color: rgb(0, 0, 255); ">Apache Pig中文教程（进阶）</span></a>》</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;GROUP时按所有字段分组可以用GROUP ALL吗<br />
假设你有如下数据文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]<span class="hljs-comment" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(128, 128, 128); word-wrap: inherit !important; word-break: inherit !important;">#&nbsp;cat&nbsp;3.txt&nbsp;</span>
1&nbsp;&nbsp;&nbsp;&nbsp;9
2&nbsp;&nbsp;&nbsp;&nbsp;2
3&nbsp;&nbsp;&nbsp;&nbsp;3
4&nbsp;&nbsp;&nbsp;&nbsp;0
1&nbsp;&nbsp;&nbsp;&nbsp;9
1&nbsp;&nbsp;&nbsp;&nbsp;9
4&nbsp;&nbsp;&nbsp;&nbsp;0
</code></pre>
</section>
<p>现在要找出第1、2列的组合中，每一种的个数分别为多少，例如，(1,9)组合有3个，(4,0)组合有两个，依此类推。<br />
显而易见，我们只需要用GROUP就可以轻易完成这个任务。于是写出如下代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;3.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;ALL;
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;group,&nbsp;COUNT(A);
DUMP&nbsp;C;
</code></pre>
</section>
<p>可惜，结果不是我们想要的：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(all,7)
</code></pre>
</section>
<p>为什么呢？我们的本意是按所有列来GROUP，于是使用了GROUP ALL，但是这实际上变成了统计行数，下面的代码就是一段标准的统计数据行数的代码：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;3.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;ALL;
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;COUNT(A);
DUMP&nbsp;C;
</code></pre>
</section>
<p>因此，上面的&nbsp;<span style="color:#0000ff;">C = FOREACH B GENERATE group, COUNT(A)</span> 也无非就是多打印了一个group的名字（<span style="color:#ff0000;">all</span>）而已&mdash;&mdash;group的名字被设置为&ldquo;<span style="color:#ff0000;">all</span>&rdquo;，这是Pig帮你做的。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
正确的做法很简单，只需要按所有字段GROUP，就可以了：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;3.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>);
B&nbsp;=&nbsp;GROUP&nbsp;A&nbsp;BY&nbsp;(col1,&nbsp;col2);
C&nbsp;=&nbsp;FOREACH&nbsp;B&nbsp;GENERATE&nbsp;group,&nbsp;COUNT(A);
DUMP&nbsp;C;
</code></pre>
</section>
<p>结果如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">((1,9),3)
((2,2),1)
((3,3),1)
((4,0),2)
</code></pre>
</section>
<p>这与我们前面分析的正确结果是一样的。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;在Pig中使用中文字符串<br />
有读者来信问我，<span style="color:#800080;">如何在Pig中使用中文作为FILTER的条件</span>？我做了如下测试，结论是可以使用中文。<br />
数据文件 data.txt 内容为（每一列之间以TAB为分隔符）：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;北京市&nbsp;a
2&nbsp;&nbsp;&nbsp;&nbsp;上海市&nbsp;b
3&nbsp;&nbsp;&nbsp;&nbsp;北京市&nbsp;c
4&nbsp;&nbsp;&nbsp;&nbsp;北京市&nbsp;f
5&nbsp;&nbsp;&nbsp;&nbsp;天津市&nbsp;e
</code></pre>
</section>
<p>Pig脚本文件 test.pig 内容为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;data.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray,&nbsp;col3:&nbsp;chararray);
B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;(col2&nbsp;==&nbsp;&#39;北京市&#39;);
DUMP&nbsp;B;
</code></pre>
</section>
<p>首先，我这两个文件的编码都是<span style="color:#ff0000;">UTF-8(无BOM)</span>，在Linux命令行下，我直接以本地模式执行Pig脚本 test.pig：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">pig&nbsp;-x&nbsp;local&nbsp;test.pig
</code></pre>
</section>
<p>得到的输出结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1,北京市,a)
(3,北京市,c)
(4,北京市,f)
</code></pre>
</section>
<p>可见结果是正确的。<br />
<span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="color:#0000ff;">但是</span>，如果我在grunt交互模式下，把 test.pig 的内容粘贴进去执行，是得不到任何输出结果的：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">grunt&gt;&nbsp;A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;data.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;<span class="hljs-built_in" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">int</span>,&nbsp;col2:&nbsp;chararray,&nbsp;col3:&nbsp;chararray);
grunt&gt;&nbsp;B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;(col2&nbsp;==&nbsp;&#39;北京市&#39;);
grunt&gt;&nbsp;DUMP&nbsp;B;
</code></pre>
</section>
<p>具体原因我不清楚，但是至少有一点是肯定的：可以使用中文作为FILTER的条件，只要不在交互模式下执行你的Pig脚本即可。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何统计 tuple 中的 field 数，bag 中的 tuple 数，map 中的 key/value 组数<br />
一句话：用Pig内建的 <span style="color:#0000ff;">SIZE</span> 函数：</p>
<blockquote>
<p>
		Computes the number of elements based on any Pig data type.</p>
</blockquote>
<p>具体可看<a href="http://pig.apache.org/docs/r0.8.1/piglatin_ref2.html#SIZE" rel="noopener noreferrer" target="_blank">这个</a>链接。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;一个字符串为null，与它为空不一定等价<br />
在某些情况下，要获取&ldquo;不为空&rdquo;的字符串，仅仅用 is not null 来判断是不够的，还应该加上 SIZE(field_name) &gt; 0 的条件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">B&nbsp;=&nbsp;FILTER&nbsp;A&nbsp;BY&nbsp;(field_name&nbsp;is&nbsp;not&nbsp;null&nbsp;AND&nbsp;(SIZE(field_name)&nbsp;&gt;&nbsp;0L));
</code></pre>
</section>
<p>注意，这只是在某些情况下需要这样做，在一般情况下，仅用 is not null 来过滤就可以了。我并没有总结出特殊情况是哪些情况，我只能说我我不是第一次遇到此情况了，所以才有了这一个结论。<br />
注意上面使用的是&ldquo;0<span style="color:#ff0000;">L</span>&rdquo;，因为SIZE()返回的是long类型，如果不加L，在Pig0.10下会出现一个警告，例如：</p>
<blockquote>
<p>
		[main] WARN &nbsp;org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_LONG 1 time(s)</p>
</blockquote>
<p>
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;Pig中的各operator（操作符），哪些会触发reduce过程<br />
<span style="color:#ff0000;">①</span>GROUP：由于GROUP操作会将所有具有相同key的记录收集到一起，所以数据如果正在map中处理的话，就会触发shuffle&rarr;reduce的过程。<br />
<span style="color: rgb(255, 0, 0); ">②</span>ORDER：由于需要将所有相等的记录收集到一起（才能排序），所以ORDER会触发reduce过程。同时，除了你写的那个Pig job之外，Pig还会添加一个额外的M-R job到你的数据流程中，因为Pig需要对你的数据集做采样，以确定数据的分布情况，从而解决数据分布严重不均的情况下job效率过于低下的问题。<br />
<span style="color: rgb(255, 0, 0); ">③</span>DISTINCT：由于需要将记录收集到一起，才能确定它们是不是重复的，因此DISTINCT会触发reduce过程。当然，DISTINCT也会利用combiner在map阶段就把重复的记录移除。<br />
<span style="color: rgb(255, 0, 0); ">④</span>JOIN：JOIN用于求重合，由于求重合的时候，需要将具有相同key的记录收集到一起，因此，JOIN会触发reduce过程。<br />
<span style="color: rgb(255, 0, 0); ">⑤</span>LIMIT：由于需要将记录收集到一起，才能统计出它返回的条数，因此，LIMIT会触发reduce过程。<br />
<span style="color: rgb(255, 0, 0); ">⑥</span>COGROUP：与GROUP类似（参看本文前面的部分），因此它会触发reduce过程。<br />
<font color="#ff0000">⑦</font>CROSS：计算两个或多个关系的叉积。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;如何统计一个字符串中包含的指定字符数<br />
这可以不算是个Pig的问题了，你可以把它认为是一个shell的问题。从本文前面部分我们已经知道，Pig中可以用 STREAM ... THROUGH 来调用shell进行辅助数据处理，所以在这我们也能这样干。<br />
假设有文本文件：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">[root@localhost&nbsp;~]$&nbsp;cat&nbsp;1.txt&nbsp;
123&nbsp;&nbsp;&nbsp;&nbsp;abcdef:243789174
456&nbsp;&nbsp;&nbsp;&nbsp;DFJKSDFJ:3646:555558888
789&nbsp;&nbsp;&nbsp;&nbsp;yKDSF:00000%0999:2343324:11111:33333
</code></pre>
</section>
<p>现在要统计：每一行中，第二列里所包含的冒号（&ldquo;:&rdquo;）分别为多少？代码如下：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;1.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray,&nbsp;col2:&nbsp;chararray);
B&nbsp;=&nbsp;STREAM&nbsp;A&nbsp;THROUGH&nbsp;`awk&nbsp;-F&quot;:&quot;&nbsp;&#39;{print&nbsp;NF-1}&#39;`&nbsp;AS&nbsp;(colon_count:&nbsp;int);
DUMP&nbsp;B;
</code></pre>
</section>
<p>结果为：</p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">(1)
(2)
(4)
</code></pre>
</section>
<p><span style="color: rgb(255, 255, 255); ">文章来源：</span><a href="http://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255); ">http://www.codelast.com/</span></a><br />
<span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;UDF是区分大小写的<br />
因为UDF是由Java类来实现的，所以区分大小写，就这么简单。</p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;<span style="font-family: 文泉驿等宽微米黑;">设置Pig job的job name<br />
在Pig脚本开头加上一句：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;"><span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">set</span>&nbsp;job.name&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;My-Job-Name&#39;</span>;
</code></pre>
</section>
<p><span style="font-family: 文泉驿等宽微米黑;">那么，执行该Pig脚本之后，在Hadoop的Job Tracker中看到的&ldquo;Name&rdquo;就是&ldquo;My-Job-Name&rdquo;了。<br />
如果不设置，显示的name是类似于&ldquo;</span>Job6245768625829738970.jar<span style="font-family: 文泉驿等宽微米黑;">&rdquo;这样的东西，job多的时候完全没有标识度，建议一定要设置一个特殊的job name。</span></p>
<p><span style="background-color: rgb(0, 255, 0);">➤</span>&nbsp;<span style="font-family: 文泉驿等宽微米黑;">把纯文本转化为JSON<br />
假设输入文件 a.txt 内容为：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">1&nbsp;&nbsp;&nbsp;&nbsp;2
9&nbsp;&nbsp;&nbsp;&nbsp;8
</code></pre>
</section>
<p><span style="font-family: 文泉驿等宽微米黑;">则如下Pig代码将把它转化为JSON格式：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">A&nbsp;=&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">LOAD</span>&nbsp;<span class="hljs-string" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(238, 220, 112); word-wrap: inherit !important; word-break: inherit !important;">&#39;a.txt&#39;</span>&nbsp;<span class="hljs-keyword" style="font-size: inherit; line-height: inherit; margin: 0px; padding: 0px; color: rgb(248, 35, 117); word-wrap: inherit !important; word-break: inherit !important;">AS</span>&nbsp;(col1:&nbsp;chararray,&nbsp;col2:&nbsp;chararray);
B&nbsp;=&nbsp;STORE&nbsp;A&nbsp;INTO&nbsp;&#39;result&#39;&nbsp;USING&nbsp;JsonStorage();
</code></pre>
</section>
<p><span style="font-family: 文泉驿等宽微米黑;">查看输出文件的内容是：</span></p>
<section class="output_wrapper" id="output_wrapper_id" style="font-size: 16px; color: rgb(62, 62, 62); line-height: 1.6; letter-spacing: 0px; font-family: &quot;Helvetica Neue&quot;, Helvetica, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;">
<pre style="font-size: inherit; color: inherit; line-height: inherit; margin-top: 0px; margin-bottom: 0px; padding: 0px;">
<code class="sql language-sql hljs" style="margin: 0px 2px; line-height: 18px; font-size: 14px; letter-spacing: 0px; font-family: Consolas, Inconsolata, Courier, monospace; border-radius: 0px; color: rgb(169, 183, 198); background: rgb(40, 43, 46); padding: 0.5em; overflow-wrap: normal !important; word-break: normal !important; overflow: auto !important; display: -webkit-box !important;">{&quot;col1&quot;:&quot;1&quot;,&quot;col2&quot;:&quot;2&quot;}
{&quot;col1&quot;:&quot;9&quot;,&quot;col2&quot;:&quot;8&quot;}
</code></pre>
</section>
<p><span style="font-family: 文泉驿等宽微米黑;">可见，你LOAD输入数据时定义的字段名，就是输出文件中的JSON字段名。</span></p>
<p>
<span style="color: rgb(255, 255, 255);">文章来源：</span><a href="https://www.codelast.com/" rel="noopener noreferrer" target="_blank"><span style="color: rgb(255, 255, 255);">https://www.codelast.com/</span></a><br />
<span style="color: rgb(255, 0, 0);">➤➤</span>&nbsp;版权声明&nbsp;<span style="color: rgb(255, 0, 0);">➤➤</span>&nbsp;<br />
转载需注明出处：<u><a href="https://www.codelast.com/" rel="noopener noreferrer" target="_blank"><em><span style="color: rgb(0, 0, 255);"><strong style="font-size: 16px;"><span style="font-family: arial, helvetica, sans-serif;">codelast.com</span></strong></span></em></a></u>&nbsp;<br />
感谢关注我的微信公众号（微信扫一扫）：<br />
<img decoding="async" alt="wechat qrcode of codelast" src="https://www.codelast.com/codelast_wechat_qr_code.jpg" style="color: rgb(77, 77, 77); font-size: 13px; width: 200px; height: 200px;" /><br />
以及我的微信视频号：</p>
<p style="border: 0px; font-size: 13px; margin: 0px 0px 9px; outline: 0px; padding: 0px; color: rgb(77, 77, 77);">
	<img decoding="async" alt="" src="https://www.codelast.com/wechat_shipinhao_qr_code.jpg" style="text-align: center; width: 200px; height: 199px;" /></p>
<div id="KSFIND_MASK" style="background-color: rgb(0, 0, 0); opacity: 0.22; position: absolute !important; left: 0px !important; top: 0px !important; border: 0px none !important; padding: 0px !important; z-index: 1000000 !important; height: 0px; width: 0px; display: none; cursor: auto; ">
	&nbsp;</div>
]]></content:encoded>
					
					<wfw:commentRss>https://www.codelast.com/%e5%8e%9f%e5%88%9bpig%e4%b8%ad%e7%9a%84%e4%b8%80%e4%ba%9b%e5%9f%ba%e7%a1%80%e6%a6%82%e5%bf%b5%e6%80%bb%e7%bb%93/feed/</wfw:commentRss>
			<slash:comments>34</slash:comments>
		
		
			</item>
	</channel>
</rss>
