码迷,mamicode.com
首页 > 编程语言 > 详细

【python】入门学习(十)

时间:2014-09-04 16:44:29      阅读:326      评论:0      收藏:0      [点我收藏+]

标签:style   blog   color   os   io   ar   for   art   div   

#入门学习系列的内容均是在学习《Python编程入门(第3版)》时的学习笔记

统计一个文本文档的信息,并输出出现频率最高的10个单词

#text.py
# Characters that normalize() keeps: lowercase ASCII letters, the space,
# the hyphen and the apostrophe. Every other character is dropped.
keep = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        ' ', '-', "'"}
# Normalize a piece of text.
def normalize(s):
    """Convert s to a normalized string.

    The text is lower-cased and every character that is not a member of
    the module-level ``keep`` set is removed.
    """
    # str.join builds the result in one pass instead of repeated +=.
    return ''.join(c for c in s.lower() if c in keep)

# Report basic information about a text file.
def file_stats(fname):
    """Print statistics for the given file.

    Reads the whole file named ``fname`` and prints its number of
    characters, its number of newline characters and its number of words
    (counted after the text has gone through normalize()).
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

# Turn a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Count how often each word occurs in s.

    s is normalized first and then split on whitespace; the returned
    dictionary maps each word to its number of occurrences.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts

# Report basic information about a text file plus its most frequent words.
def file_stats2(fname):
    """Print statistics for the given file and its 10 most frequent words.

    Reads the whole file named ``fname``, prints its number of characters,
    newline characters and words, then lists up to ten words with the
    highest counts from make_freq_dict().
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # Descending (count, word) order: highest count first, and words with
    # equal counts in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Only the first ten entries are shown, matching the heading above.
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))
>>> file_stats2('a.txt')
The file a.txt has:
  12927 characters
  297 lines
  1645 words

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so

 

进一步完善的代码:

#text.py
# Characters that normalize() keeps: lowercase ASCII letters, the space,
# the hyphen and the apostrophe. Every other character is dropped.
keep = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        ' ', '-', "'"}
# Normalize a piece of text.
def normalize(s):
    """Convert s to a normalized string.

    The text is lower-cased and every character that is not a member of
    the module-level ``keep`` set is removed.
    """
    # str.join builds the result in one pass instead of repeated +=.
    return ''.join(c for c in s.lower() if c in keep)

# Report basic information about a text file.
def file_stats(fname):
    """Print statistics for the given file.

    Reads the whole file named ``fname`` and prints its number of
    characters, its number of newline characters and its number of words
    (counted after the text has gone through normalize()).
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

# Turn a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Count how often each word occurs in s.

    s is normalized first and then split on whitespace; the returned
    dictionary maps each word to its number of occurrences.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts

# Report extended information about a text file plus its most frequent words.
def file_stats2(fname):
    """Print statistics for the given file and its 10 most frequent words.

    Reads the whole file named ``fname`` and prints: the number of
    characters, newline characters and words; how many words occur exactly
    once; how many different words there are; the average length of the
    different words; and up to ten words with the highest counts from
    make_freq_dict().
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_different_words = len(d)
    num_words = sum(d.values())
    # Guard against a file with no words so the report does not crash
    # with ZeroDivisionError; the average is reported as 0.0 in that case.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # Descending (count, word) order: highest count first, and words with
    # equal counts in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("  %s words appreance one time" % num_once)
    print("  %s different words" % num_different_words)
    print("  %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))

def main():
    """Print the statistics report for the file ``a.txt``."""
    file_stats2('a.txt')

# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()
>>> ================================ RESTART ================================
>>> 
The file a.txt has:
  12927 characters
  297 lines
  1645 words
  515 words appreance one time
  699 different words
  6.539341917024321 average length

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so

 

【python】入门学习(十)

标签:style   blog   color   os   io   ar   for   art   div   

原文地址:http://www.cnblogs.com/dplearning/p/3956242.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!