Python:遍历 Hadoop 的输出数据,与指定列表对比,取出包含列表中值的记录。

import sys
import tstree

# Words to match against, loaded once into a ternary search trie.
fname = 'high_freq_site.list'
tree = tstree.TernarySearchTrie()
tree.loadData(fname)

# Running aggregate for the current group of consecutive identical urls.
token = ''    # url of the current group ('' until the first valid record)
counter = 0   # sum of the count column over the current group
post = []     # every posttime seen in the current group

# Each input record is: url, count, posttime (whitespace separated).
# Only consecutive lines with the same url are merged, so the input is
# presumably sorted/grouped by url -- TODO confirm against the producer.
for line in sys.stdin:
    arr = line.strip().split()
    if len(arr) != 3:
        # Skip records that do not have exactly three fields.
        continue

    url = arr[0]
    num = int(arr[1])
    posttime = int(arr[2])

    if token != url:
        # The url changed: emit the finished group (if there is one) and
        # start a fresh group for the new url.
        if token != '':
            ret = tree.maxMatch(token)
            if ret and post:
                # A parenthesized single-argument print behaves like the
                # Python 2 print statement and also parses on Python 3.
                print('%s\t%s\t%s\t%s' % (ret, token, counter, min(post)))
        token = url
        counter = 0
        post = []

    # The current record always contributes to the (possibly new) group.
    # Previously the posttime of the first record of each new group was
    # dropped, so single-line urls were never printed.
    counter += num
    post.append(posttime)

# Emit the last group, which is not followed by a url change.
ret = tree.maxMatch(token)
if ret and post:
    print('%s\t%s\t%s\t%s' % (ret, token, counter, min(post)))

class TSTNode(object):
    """A single node of a ternary search trie.

    A node carries one split character, an optional payload (``data``) and
    links to three child nodes: ``loNode``, ``eqNode`` and ``hiNode``.
    """

    def __init__(self, splitchar):
        """Create a childless node for ``splitchar`` with no payload."""
        # A new node starts without any children.
        self.loNode = self.eqNode = self.hiNode = None
        # No payload until a caller assigns one.
        self.data = None
        self.splitchar = splitchar

class TernarySearchTrie(object):
    """Ternary search trie that stores words and finds the longest stored
    word that is a prefix of a query string."""

    def __init__(self):
        """Create an empty trie (no root node yet)."""
        self.rootNode = None

    def loadData(self, fname):
        """Insert every non-blank line of the file ``fname`` into the trie.

        Each line is stripped of surrounding whitespace before insertion.
        Lines that are empty after stripping are skipped. The node that ends
        a word stores that word in its ``data`` attribute.
        """
        # The with-block closes the file even if reading or insertion raises.
        with open(fname) as f:
            for line in f:
                line = line.strip()
                node = self.addWord(line)
                if node is not None:
                    node.data = line

    def addWord(self, word):
        """Insert ``word`` into the trie, creating nodes as needed.

        Returns the node matching the last character of ``word``, or None
        when ``word`` is empty. The returned node's ``data`` is not touched;
        the caller decides what to store there.
        """
        if not word:
            return None

        if self.rootNode is None:
            self.rootNode = TSTNode(word[0])

        currentNode = self.rootNode
        charIndex = 0

        while True:
            char = word[charIndex]
            if char == currentNode.splitchar:
                # Character consumed: either the word is finished or we
                # descend through the "equal" link for the next character.
                charIndex += 1
                if charIndex == len(word):
                    return currentNode
                if currentNode.eqNode is None:
                    currentNode.eqNode = TSTNode(word[charIndex])
                currentNode = currentNode.eqNode
            elif char < currentNode.splitchar:
                if currentNode.loNode is None:
                    currentNode.loNode = TSTNode(char)
                currentNode = currentNode.loNode
            else:
                if currentNode.hiNode is None:
                    currentNode.hiNode = TSTNode(char)
                currentNode = currentNode.hiNode

    def maxMatch(self, url):
        """Return the longest stored word that is a prefix of ``url``.

        Returns None when no stored word is a prefix of ``url``, when the
        trie is empty, or when ``url`` is empty or None.
        """
        if not url:
            return None

        ret = None
        currentNode = self.rootNode
        charIndex = 0
        # Stop when the trie path ends or the whole url has been consumed.
        while currentNode is not None and charIndex < len(url):
            char = url[charIndex]
            if char == currentNode.splitchar:
                charIndex += 1
                # A node with data marks the end of a stored word; keep the
                # most recent one, which is the longest prefix seen so far.
                if currentNode.data:
                    ret = currentNode.data
                currentNode = currentNode.eqNode
            elif char < currentNode.splitchar:
                currentNode = currentNode.loNode
            else:
                currentNode = currentNode.hiNode
        return ret

if __name__ == '__main__':
    # Manual check: load the word list, then for every line read from
    # standard input print the result of maxMatch (None when nothing matches).
    import sys

    fname = 'high_freq_site.list'
    tree = TernarySearchTrie()
    tree.loadData(fname)

    for raw in sys.stdin:
        # Strip surrounding whitespace (including the newline) before lookup.
        print(tree.maxMatch(raw.strip()))

Original: https://www.cnblogs.com/i80386/p/5058584.html
Author: 雨渐渐
Title: python 遍历hadoop, 跟指定列表对比 包含列表中值的取出。

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/10823/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

免费咨询
免费咨询
扫码关注
扫码关注
联系站长

站长Johngo!

大数据和算法重度研究者!

持续产出大数据、算法、LeetCode干货,以及业界好资源!

2022012703491714

微信来撩,免费咨询:xiaozhu_tec

分享本页
返回顶部
最近整理资源【免费获取】:   👉 程序员最新必读书单  | 👏 互联网各方向面试题下载 | ✌️计算机核心资源汇总