03_MapReduce框架原理_3.4 InputSplit 切片类(源码)

InputSplit 切片类
1.0 类的作用：InputSplit 在逻辑上包含了提供给处理该 InputSplit 的 Mapper 的所有 key-value。
1.1 抽象方法：
  1. public abstract long getLength()：获取 InputSplit 对象的大小（单位 Bytes），支持根据 InputSplit 的 size 来排序。
  2. public abstract String[] getLocations()：获取该切片存储节点的位置信息。
1.2 FileSplit 实现类的成员属性：
  1. private Path file：该切片所属文件的路径
  2. private long start：切片起始位置
  3. private long length：切片长度
  4. private String[] hosts：存储切片的 hosts
1.3 CombineFileSplit 实现类：为每个 MapTask 提供一个 InputSplit 对象，包含了这个 MapTask 要处理的数据。

点击查看InputSplit

/**
 * Split class: represents one portion of data to be processed by a Mapper.
 */
public abstract class InputSplit {

  /**
   * Returns the length of this split, in bytes.
   *
   * @return the split length in bytes
   * @throws IOException declared for implementations that need I/O
   * @throws InterruptedException declared for implementations that may block
   */
  public abstract long getLength() throws IOException, InterruptedException;

  /**
   * Returns the storage location information of this split.
   *
   * @return the locations for this split, as strings
   * @throws IOException declared for implementations that need I/O
   * @throws InterruptedException declared for implementations that may block
   */
  public abstract String[] getLocations() throws IOException, InterruptedException;

  /**
   * Returns detailed storage location information for the split.
   *
   * <p>This base implementation always returns {@code null}; subclasses may
   * override it to supply real data.
   *
   * @return {@code null} in this base class
   * @throws IOException declared for overriding implementations
   */
  public SplitLocationInfo[] getLocationInfo() throws IOException {
    return null;
  }
}

FileSplit对应的是一个输入文件，也就是说，如果用FileSplit对应的FileInputFormat作为输入格式，
那么即使文件特别小，也是作为一个单独的InputSplit来处理，而每一个InputSplit将会由一个独立的Mapper Task来处理。
在输入数据是由大量小文件组成的情形下，就会有同样大量的InputSplit，
从而需要同样大量的Mapper来处理，大量的Mapper Task创建销毁开销将是巨大的，甚至对集群来说，是灾难性的！

点击查看FileSplit

/**
 * Split class: represents one portion of data to be processed by a Mapper.
 *
 * <p>It is the return value of InputFormat's getSplits method and the input
 * of InputFormat's createRecordReader method. Each split covers either part
 * of a file or a whole file (when the file is not splittable, or when the
 * file size is smaller than 1.1 times the split size).
 *
 * <p>Only the path, start and length are serialized by {@link #write};
 * host information is not written and is therefore cleared by
 * {@link #readFields}.
 */
public class FileSplit extends InputSplit implements Writable {
  private Path file;      // file this split belongs to
  private long start;     // start position of the split within the file
  private long length;    // split length, in bytes
  private String[] hosts; // hosts storing the block this split belongs to
  // Per-host location info (host name + in-memory flag). Only populated by
  // the five-argument constructor; null otherwise.
  private SplitLocationInfo[] hostInfos;

  /** No-argument constructor; fields are expected to be filled by {@link #readFields}. */
  public FileSplit() {}

  /**
   * Creates a split over a region of a file.
   *
   * @param file the file this split belongs to
   * @param start position of the first byte of the split within the file
   * @param length number of bytes in the split
   * @param hosts hosts storing the split's data
   */
  public FileSplit(Path file, long start, long length, String[] hosts) {
    this.file = file;
    this.start = start;
    this.length = length;
    this.hosts = hosts;
  }

  /**
   * Creates a split over a region of a file and records, for every host,
   * whether it also appears in {@code inMemoryHosts}.
   *
   * @param file the file this split belongs to
   * @param start position of the first byte of the split within the file
   * @param length number of bytes in the split
   * @param hosts hosts storing the split's data; must not be null
   * @param inMemoryHosts hosts flagged as in-memory; must not be null
   */
  public FileSplit(Path file, long start, long length, String[] hosts,
      String[] inMemoryHosts) {
    this(file, start, length, hosts);
    hostInfos = new SplitLocationInfo[hosts.length];
    for (int i = 0; i < hosts.length; i++) {
      // because N will be tiny, scanning is probably faster than a HashSet
      boolean inMemory = false;
      for (String inMemoryHost : inMemoryHosts) {
        if (inMemoryHost.equals(hosts[i])) {
          inMemory = true;
          break;
        }
      }
      hostInfos[i] = new SplitLocationInfo(hosts[i], inMemory);
    }
  }

  /** The file containing this split's data. */
  public Path getPath() { return file; }

  /** The position of the first byte in the file to process. */
  public long getStart() { return start; }

  /** The number of bytes in the file to process. */
  @Override
  public long getLength() { return length; }

  /** Formats the split as {@code file:start+length}. */
  @Override
  public String toString() { return file + ":" + start + "+" + length; }

  ////////////////////////////////////////////
  // Writable methods (serialization)
  ////////////////////////////////////////////

  /**
   * Serializes the file path, start offset and length. Host information is
   * not written.
   *
   * @param out the sink to write to
   * @throws IOException if writing fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    Text.writeString(out, file.toString());
    out.writeLong(start);
    out.writeLong(length);
  }

  /**
   * Restores the file path, start offset and length, in the order written by
   * {@link #write}.
   *
   * @param in the source to read from
   * @throws IOException if reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    file = new Path(Text.readString(in));
    start = in.readLong();
    length = in.readLong();
    // Host data is not part of the serialized form. Clear both host fields so
    // an instance reused for deserialization does not keep reporting the
    // location info of the split it previously held.
    hosts = null;
    hostInfos = null;
  }

  /**
   * Returns the hosts of this split, or an empty array when none are known
   * (for example after {@link #readFields}).
   */
  @Override
  public String[] getLocations() throws IOException {
    if (this.hosts == null) {
      return new String[]{};
    } else {
      return this.hosts;
    }
  }

  /**
   * Returns the per-host location info, or {@code null} when the split was
   * not built with the five-argument constructor or has been deserialized.
   */
  @Override
  @Evolving
  public SplitLocationInfo[] getLocationInfo() throws IOException {
    return hostInfos;
  }
}

CombineFileSplit是针对小文件的分片，它将一系列小文件封装在一个InputSplit内，这样一个Mapper就可以处理多个小文件，
从而有效地降低进程开销。与FileSplit类似，CombineFileSplit同样包含文件路径、分片起始位置、
分片大小和分片数据所在的host列表四个属性，只不过这些属性不再是一个值，而是一个列表。
需要注意的一点是，CombineFileSplit的getLength()方法返回的是这一系列数据的总长度。

点击查看CombineFileSplit

// &#x5207;&#x7247;&#x7C7B;,&#x8868;&#x793A; &#x4E00;&#x4EFD;&#x88AB;Mapper&#x5904;&#x7406;&#x7684;&#x6570;&#x636E;
// &#x4E00;&#x4E2A;&#x5207;&#x7247;&#x5BF9;&#x8C61;,&#x53EF;&#x4EE5;&#x5305;&#x542B;&#x591A;&#x4E2A;&#x6587;&#x4EF6;
public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  private long totLength;

  /**
   * default constructor
   */
  public CombineFileSplit() {}
  public CombineFileSplit(Path[] files, long[] start,
                          long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);
  }

  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    }
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    }
    initSplit(files, startoffset, lengths, locations);
  }

  private void initSplit(Path[] files, long[] start,
                         long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for(long length : lengths) {
      totLength += length;
    }
  }

  /**
   * Copy constructor
   */
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(),
         old.getLengths(), old.getLocations());
  }

  public long getLength() {
    return totLength;
  }

  /** Returns an array containing the start offsets of the files in the split*/
  public long[] getStartOffsets() {
    return startoffset;
  }

  /** Returns an array containing the lengths of the files in the split*/
  public long[] getLengths() {
    return lengths;
  }

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  }

  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  }

  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;
  }

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  }

  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;
  }

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;
  }

  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for(int i=0; i<arrlength;i++) 0 { lengths[i]="in.readLong();" } int fileslength="in.readInt();" paths="new" path[fileslength]; for(int i="0;" i<fileslength;i++) paths[i]="new" path(text.readstring(in)); arrlength="in.readInt();" startoffset="new" long[arrlength]; i<arrlength;i++) startoffset[i]="in.readLong();" public void write(dataoutput out) throws ioexception out.writelong(totlength); out.writeint(lengths.length); for(long length : lengths) out.writelong(length); out.writeint(paths.length); for(path p paths) text.writestring(out, p.tostring()); out.writeint(startoffset.length); startoffset) @override string tostring() stringbuffer sb="new" stringbuffer(); for (int < paths.length; i++) if (i="=" ) sb.append("paths:"); sb.append(paths[i].touri().getpath() + ":" "+" lengths[i]); paths.length -1) sb.append(","); (locations !="null)" locs ; locsb="new" locations.length; locsb.append(locations[i] ":"); sb.append(" locations:" "; "); return sb.tostring(); code></arrlength;i++)>

Original: https://www.cnblogs.com/bajiaotai/p/15708969.html
Author: 学而不思则罔！
Title: 03_MapReduce框架原理_3.4 InputSplit 切片类(源码)

原创文章受到原创版权保护。转载请注明出处：https://www.johngo689.com/522641/

转载文章受原作者版权保护。转载请注明原作者出处！

一	二	三	四	五	六	日
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

03_MapReduce框架原理_3.4 InputSplit 切片类(源码)

大家都在看