  • Python keyword extraction (jieba + sklearn)

    Original post: https://www.jianshu.com/p/85a0e7a7bebf

    #!/usr/bin/python
    # coding=utf-8
    # Extract text keywords with TF-IDF
    # http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
    
    import sys
    import os
    from config_ch import *
    import chardet
    import numpy as np
    import pandas as pd
    import xlrd
    import copy
    import glob
    import jieba.posseg
    import jieba.analyse
    import io
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    """
           TF-IDF weighting:
               1. CountVectorizer  builds the term-frequency matrix
               2. TfidfTransformer computes the TF-IDF weights
               3. The keywords of each text
               4. The corresponding TF-IDF matrix
    """
    # Data loading
    """
        Input: data_path, the folder holding the data files. Output: data, a dict with 'id', 'title' and 'abstract'.
    """
    def dataRead(data_path):
        file_list = os.listdir(data_path)
        idList, titleList, abstractList = range(0, len(file_list)), [], []  # three lists holding text ids, titles and contents
        for file_name in file_list:
            file_path = os.path.join(data_path, file_name)
            if os.path.isfile(file_path):
                f = io.open(file_path, 'rb').read()
                encoding_type = chardet.detect(f)  # detect the file encoding
                if not encoding_type['encoding']:
                    encoding_type['encoding'] = 'utf-8-sig'  # some files report no encoding; force utf-8-sig
                file = f.decode(encoding_type['encoding'])
                titleList.append(file[0:file.find('\n', 1)+1])  # the first line of the file is the title
                abstractList.append(file)
        data = {"id": idList, "title": titleList, "abstract": abstractList}
        return data
    
    # Preprocessing
    """
        Input: the text and the stopword list. Output: text_seg, the segmentation result.
        Preprocessing = jieba segmentation, stopword removal and POS filtering.
    """
    def dataPrepos(text, stopword):
        text_seg = []
        seg = jieba.posseg.cut(text)  # word segmentation
        for i in seg:
            if i.word not in stopword and i.flag in pos:  # remove stopwords and filter by POS
                text_seg.append(i.word)
        return text_seg
    
    # Keyword mapping
    """
        Input: the keywords key and the mapping table mapword. Output: key_left_mapped,
        holding the keywords left after mapping ("left") and the keywords obtained by mapping ("mapped").
        Column 1 of the mapping table is the atom word list; the substitution words start from column 2.
        If a keyword is in the atom list, its substitution words are added to mappedList and the keyword is removed from leftList;
        if a keyword is itself a substitution word, it is added to mappedList and removed from leftList.
    """
    def keysMapping(key, mapword):  # keywords found in atom go to mappedList; leftList keeps only keywords not found in atom
        leftList, mappedList = copy.deepcopy(key), []  # initialize leftList and mappedList
        atom = mapword.col_values(0)
        for i in key:
            if i in atom:  # the keyword is an atom word: replace it with its substitution words
                mappedList.extend(mapword.row_values(atom.index(i))[1:])
                mappedList = list(filter(None, mappedList))  # drop empty "" strings
                leftList.pop(leftList.index(i))  # remove from leftList
            else:
                for n in range(len(atom)):
                    row = mapword.row_values(n)[1:]
                    if i in row:  # the keyword is itself a substitution word: add it to mappedList and remove it from leftList
                        mappedList.extend([i])
                        leftList.pop(leftList.index(i))
                        break
    
        mappedList = list(set(mappedList))  # deduplicate
        key_left_mapped = {"left": leftList, "mapped": mappedList}
        return key_left_mapped
    
    # Extract the topK keywords with TF-IDF
    """
        Input: the data, the stopword list, the mapping table, and the accumulators mapped and keys_all.
        When mode is 'tf', getKeywords is called once per text and needs the text id;
        when mode is 'tfidf', getKeywords is called once for all texts together, so id is unused and set to 0.
    """
    def getKeywords(data, id, stopword, mapword, mapped, keys_all):
        # pull id, title and abstract out of data into three lists
        if mode == 'tfidf':
            idList, titleList, abstractList = data['id'], data['title'], data['abstract']
        elif mode == 'tf':  # take the information of the id-th text only
            idList, titleList, abstractList = [data['id'][id]], [data['title'][id]], [data['abstract'][id]]
    
        corpus = []  # all texts collected into one list, one text per element
        result = pd.DataFrame({"id": [], "title": [], "key": [], "left": [], "mapped": []},
                              columns=['id', 'title', 'key', 'left', 'mapped'])
        # preprocess each text and join the resulting words into a space-separated string, one entry of corpus per text
        for index in range(len(idList)):
            text = '%s' % abstractList[index]
            text_seg = dataPrepos(text, stopword)
            text_seg = " ".join(text_seg)
            corpus.append(text_seg)
        if corpus == ['']:
            return result  # empty text
        # 1. build the term-frequency matrix: convert the words of the texts into term counts
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)  # term-frequency matrix
        # 2. compute the TF-IDF weight of every word
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(X)
        # 3. get the vocabulary of the bag-of-words model
        word = vectorizer.get_feature_names()
        # 4. get the TF-IDF matrix
        weight = tfidf.toarray()
        # 5. print the word weights
        # the lists below hold the text ids, titles, extracted keywords, mapped keywords and leftover keywords
        ids, titles, keys, keys_mapped, keys_left = [], [], [], [], []
        for i in range(len(weight)):
            print("------- TF-IDF of the words in text", i+1, "-------")
            ids.append(idList[i])  # append the id to ids
            titles.append(titleList[i])  # append the title to titles
            df_word, df_weight = [], []  # all words of the current text and their corresponding weights
            for j in range(len(word)):
                print(word[j], weight[i][j])
                if weight[i][j] == 0:
                    df_word.append(' ')  # replace words whose weight is 0 with a blank placeholder
                else:
                    df_word.append(word[j])
                df_weight.append(weight[i][j])
            # turn df_word and df_weight into pandas DataFrames for sorting
            df_word = pd.DataFrame(df_word, columns=['word'])
            df_weight = pd.DataFrame(df_weight, columns=['weight'])
            word_weight = pd.concat([df_word, df_weight], axis=1)  # join the word list and the weight list
            word_weight = word_weight.sort_values(by="weight", ascending=False)  # sort by weight in descending order
            keyword = np.array(word_weight['word'])  # take the word column as an array
            key = [keyword[x] for x in range(0, min(topK, len(word)))]  # take the first topK words as keywords
            keys_all.extend(key)  # collect the keywords of the current text into keys_all for the later frequency step
    
            # keyword mapping
            key_left_mapped = keysMapping(key, mapword)
            # join the words of each list into a string
            key = " ".join(key)
            key_left_split = " ".join(key_left_mapped["left"])
            key_mapped_split = " ".join(key_left_mapped["mapped"])
    
            mapped.extend(key_left_mapped["mapped"])  # merge the mapped keywords of every text into mapped (duplicates kept)
    
            keys.append(key)
            keys_left.append(key_left_split)
            keys_mapped.append(key_mapped_split)
    
        result = pd.DataFrame({"id": ids, "title": titles, "key": keys, "left": keys_left, "mapped": keys_mapped}, columns=['id', 'title', 'key', 'left', 'mapped'])
        return result
    
    # Extract the topN most frequent keywords
    """
        Input: keys_all, the merged list of topK keywords extracted from every text.
        Output: key_most, the topN most frequent keywords.
    """
    def getKeymost(keys_all):
        counts = []
        keys_nodup = list(set(keys_all))  # keys_all with duplicates removed
        for item in keys_nodup:
            counts.append(keys_all.count(item))  # count the occurrences of every keyword
        key_word = pd.DataFrame(keys_nodup, columns=['key'])
        count_word = pd.DataFrame(counts, columns=['count'])
        key_count = pd.concat([key_word, count_word], axis=1)
        key_count = key_count.sort_values(by="count", ascending=False)
        key_freq = np.array(key_count['key'])
    
        key_most = [key_freq[x] for x in range(0, min(topN, len(key_word)))]
        return key_most
    
    
    def main():
    
        # delete previous results
        for f in glob.glob(os.path.join('result', '*.xls')):
            os.remove(f)
    
        # load the stopword list
        stopword = [w.strip() for w in io.open(stopword_path, 'r', encoding='UTF-8').readlines()]
    
        # load the mapping table
        mapword = xlrd.open_workbook(map_path).sheet_by_index(0)
    
        # load the custom dictionary used by jieba
        jieba.load_userdict(dict_path)
    
        folderList = os.listdir(data_path)
    
        for folder in folderList:  # iterate over the movie folders; each folder holds all reviews of one movie
            folder_path = os.path.join(data_path, folder)
    
            # read the data
            data = dataRead(folder_path)
    
            keys_all = []  # keywords extracted from all texts
            mapped = []  # mapped keywords of all texts, merged
    
            # keyword extraction
            if mode == 'tfidf':
                result = getKeywords(data, 0, stopword, mapword, mapped, keys_all)
                result.to_csv("result/CHkeys_tfidf_" + folder + ".xls", index=False, encoding='utf-8-sig')
            elif mode == 'tf':
                for i in range(len(data['id'])):  # in 'tf' mode, getKeywords is called once per text
                    result = getKeywords(data, i, stopword, mapword, mapped, keys_all)
                    result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', header=False, index=False, encoding='utf-8-sig')
    
            mapped = list(set(mapped))  # deduplicate
            mapped_result = pd.DataFrame({"mapped": [" ".join(mapped)]}, columns=['mapped'])
            pd.DataFrame({"": [" ".join([])]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # add an empty row
            mapped_result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig', columns=['', '', 'mapped'])
    
            # extract the most frequent keywords
            key_most = getKeymost(keys_all)
            key_most = pd.DataFrame({"most mentioned": [" ".join(key_most)]}, columns=['most mentioned'])
            pd.DataFrame({"": [" ".join([])]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # add an empty row
            key_most.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig', columns=['', '', 'most mentioned'])
    
    
    if __name__ == '__main__':
        main()
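
    The numbered steps inside getKeywords (build the term-frequency matrix, compute TF-IDF weights, read the vocabulary, take the weight matrix) are easier to see on a toy corpus. The following standalone sketch uses an invented two-document corpus and mirrors what the script does before the mapping step; note that newer scikit-learn versions expose the vocabulary as get_feature_names_out(), while the script above uses the older get_feature_names():

    # Minimal illustration of the CountVectorizer -> TfidfTransformer pipeline.
    # The two example documents are made up for demonstration only.
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    corpus = ["the movie was great great fun", "the movie was boring"]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)         # 1. term-frequency matrix
    tfidf = TfidfTransformer().fit_transform(X)  # 2. TF-IDF weights
    words = vectorizer.get_feature_names_out()   # 3. vocabulary
    weights = tfidf.toarray()                    # 4. TF-IDF matrix

    topK = 3
    for i, row in enumerate(weights):
        top = np.argsort(row)[::-1][:topK]       # indices of the topK largest weights
        print("doc", i, [(words[j], round(row[j], 3)) for j in top])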
    
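
    The script imports everything from config_ch, a module that is not shown in the post. A hypothetical config_ch.py would at least have to define the names the script relies on (data_path, stopword_path, map_path, dict_path, mode, pos, topK, topN); the values below are placeholders rather than the author's actual settings:

    # config_ch.py -- hypothetical sketch; every path and value is a placeholder.
    data_path = "data"              # folder of sub-folders, one sub-folder of reviews per movie
    stopword_path = "stopword.txt"  # one stopword per line, UTF-8
    map_path = "mapping.xls"        # Excel sheet: column 0 = atom words, later columns = substitutions
    dict_path = "userdict.txt"      # custom dictionary passed to jieba.load_userdict
    mode = "tfidf"                  # 'tfidf' = all texts together, 'tf' = one text at a time
    pos = ["n", "nz", "v", "vn"]    # POS flags kept by dataPrepos
    topK = 10                       # keywords kept per text
    topN = 10                       # most frequent keywords kept per folder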
  • This post demonstrates a way to extract content keywords in Python, shared here for reference. The analysis is as follows:

    A very efficient piece of Python code for extracting content keywords. It only works on English text; it cannot handle Chinese, which has to be segmented into words first, but once a segmentation step is added the results are just as good as for English.

    # coding=UTF-8
    import nltk
    from nltk.corpus import brown

    # This is a fast and simple noun phrase extractor (based on NLTK)
    # Feel free to use it, just keep a link back to this post
    # http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    # Create by Shlomi Babluki
    # May, 2013

    # This is our fast Part of Speech tagger
    #############################################################################
    brown_train = brown.tagged_sents(categories='news')
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
         (r'(-|:|;)$', ':'),
         (r'\'*$', 'MD'),
         (r'(The|the|A|a|An|an)$', 'AT'),
         (r'.*able$', 'JJ'),
         (r'^[A-Z].*$', 'NNP'),
         (r'.*ness$', 'NN'),
         (r'.*ly$', 'RB'),
         (r'.*s$', 'NNS'),
         (r'.*ing$', 'VBG'),
         (r'.*ed$', 'VBD'),
         (r'.*', 'NN')
    ])
    unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)

    #############################################################################
    # This is our semi-CFG; Extend it according to your own needs
    #############################################################################
    cfg = {}
    cfg["NNP+NNP"] = "NNP"
    cfg["NN+NN"] = "NNI"
    cfg["NNI+NN"] = "NNI"
    cfg["JJ+JJ"] = "JJ"
    cfg["JJ+NN"] = "NNI"
    #############################################################################

    class NPExtractor(object):

        def __init__(self, sentence):
            self.sentence = sentence

        # Split the sentence into single words/tokens
        def tokenize_sentence(self, sentence):
            tokens = nltk.word_tokenize(sentence)
            return tokens

        # Normalize brown corpus' tags ("NN", "NN-PL", "NNS" > "NN")
        def normalize_tags(self, tagged):
            n_tagged = []
            for t in tagged:
                if t[1] == "NP-TL" or t[1] == "NP":
                    n_tagged.append((t[0], "NNP"))
                    continue
                if t[1].endswith("-TL"):
                    n_tagged.append((t[0], t[1][:-3]))
                    continue
                if t[1].endswith("S"):
                    n_tagged.append((t[0], t[1][:-1]))
                    continue
                n_tagged.append((t[0], t[1]))
            return n_tagged

        # Extract the main topics from the sentence
        def extract(self):
            tokens = self.tokenize_sentence(self.sentence)
            tags = self.normalize_tags(bigram_tagger.tag(tokens))
            merge = True
            while merge:
                merge = False
                for x in range(0, len(tags) - 1):
                    t1 = tags[x]
                    t2 = tags[x + 1]
                    key = "%s+%s" % (t1[1], t2[1])
                    value = cfg.get(key, '')
                    if value:
                        merge = True
                        tags.pop(x)
                        tags.pop(x)
                        match = "%s %s" % (t1[0], t2[0])
                        pos = value
                        tags.insert(x, (match, pos))
                        break
            matches = []
            for t in tags:
                if t[1] == "NNP" or t[1] == "NNI":
                #if t[1] == "NNP" or t[1] == "NNI" or t[1] == "NN":
                    matches.append(t[0])
            return matches

    # Main method, just run "python np_extractor.py"
    def main():
        sentence = "Swayy is a beautiful new dashboard for discovering and curating online content."
        np_extractor = NPExtractor(sentence)
        result = np_extractor.extract()
        print("This sentence is about: %s" % ", ".join(result))

    if __name__ == '__main__':
        main()

    Hopefully this example is of some help to your Python programming.

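
    The post above notes that this extractor only handles English and that Chinese text would first need word segmentation. As a rough Chinese counterpart (not part of the original article), jieba's built-in TF-IDF extractor performs segmentation and keyword extraction in a single call; the sample sentence is invented:

    # coding=utf-8
    # Chinese keyword extraction with jieba's built-in TF-IDF extractor.
    import jieba.analyse

    text = "自然语言处理是人工智能领域的一个重要方向,关键词提取是其中的常见任务。"
    keywords = jieba.analyse.extract_tags(text, topK=5, withWeight=True)
    for word, weight in keywords:
        print(word, round(weight, 3))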
  • I would like to extract the corresponding keywords from each unique file using RAKE for Python. I've reviewed the documentation for RAKE; however, the suggested code in the tutorial gets keywords for...

    I am a novice user and puzzled over the following otherwise simple "loop" problem. I have a local dir with x number of files (about 500 .txt files). I would like to extract the corresponding keywords from each unique file using RAKE for Python. I've reviewed the documentation for RAKE; however, the suggested code in the tutorial gets keywords for a single document. Can someone please explain to me how to loop over an X number of files stored in my local dir. Here's the code from the tutorial and it works really well for a single document.

    $git clone https://github.com/zelandiya/RAKE-tutorial

    import rake
    import operator

    rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)
    sample_file = open("data/docs/fao_test/w2167e.txt", 'r')
    text = sample_file.read()
    keywords = rake_object.run(text)
    print("Keywords:", keywords)

    Solution

    Create a list of filenames you want to process:

    filenames = [
        'data/docs/fao_test/w2167e.txt',
        'some/other/folder/filename.txt',
        etc...
    ]

    If you don't want to hardcode all the names, you can use the glob module to collect filenames by wildcards.

    Create a dictionary for storing the results:

    results = {}

    Loop through each filename, reading the contents and storing the Rake results in the dictionary, keyed by filename:

    for filename in filenames:
        with open(filename, 'r') as fp:
            results[filename] = rake_object.run(fp.read())

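
    Putting the answer's pieces together, a minimal sketch of the whole loop could look like the following; it assumes the rake module and SmartStoplist.txt from the question, and the glob pattern is a placeholder for wherever the 500 .txt files actually live:

    import glob
    import rake  # from the RAKE-tutorial repository cloned in the question

    rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)

    # Collect every .txt file in the folder instead of hardcoding the names.
    filenames = glob.glob("data/docs/fao_test/*.txt")  # placeholder pattern

    results = {}
    for filename in filenames:
        with open(filename, 'r') as fp:
            results[filename] = rake_object.run(fp.read())

    for filename, keywords in results.items():
        print(filename, keywords[:5])  # first few (keyword, score) pairs per file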
  • I want to provide automatic string formatting in an API, for example: my_api("path/to/{self.category}/... How can I extract the keyword arguments from a Python format string: "non-keyword {keyword1} {{escaped brackets}} {} {keyword2}" => 'keyword1', 'keyword2' pyt...
  • [Python] # -*- coding:utf-8 -*- # Author: MercuryYe import urllib.request import numpy as np import pandas as pd import jieba.analyse from bs4 import BeautifulSoup ### crawler section ### url = ...
  • change_words('my_age', new_age) if __name__ == "__main__": new_name='phyger' change_name(new_name) new_age=88 change_age(new_age) 3. Result after running: C:\Users\Administrator\Desktop\document>D:/Python37/...
  • How to extract keywords in Python: import re f = open("D:/xiangmu/python/xiangmu/gjc.txt", "r", encodi... That r'.*?('+ lste +').*?' will match the text both before and after your keyword, so when that keyword...
  • Original title: Keyword extraction with jieba word segmentation in Python. Each txt folder stores one user's complete Weibo data, and the result_all file stores every user's Weibo data; the code reads each user's data and extracts 30 keywords per user. For each user it will...
  • Keyword extraction in Python

    Keyword extraction in Python: today I'll put together a simple keyword-extraction script. Extracting keywords from article content takes three main steps: (1) word segmentation, (2) stopword removal, (3) keyword extraction. There are many segmentation methods...
  • Keyword extraction in Python

    This article only covers the implementation of keyword extraction in Python. Several keyword-extraction methods: 1. TextRank, 2. TF-IDF, 3. LDA; TextRank and TF-IDF both ship as ready-made functions in jieba, so calling them is quick and convenient. Commonly used natural language...
  • Below is a worked example of keyword extraction in Python, shared as a reference that will hopefully be helpful.
  • We often need Python code to extract keywords from text for text analysis. In real applications the volume of text is large, and a single process is fairly inefficient, so multiprocessing is worth considering. Python multiprocessing only requires the multiprocessing module, ... (a minimal sketch follows after this list).
  • Simple keyword-extraction code. Extracting keywords from article content takes three main steps: (1) word segmentation, (2) stopword removal, (3) keyword extraction. There are many segmenters; here I use the popular jieba, and for stopword removal I use a stopword list. The code is as follows: import ...
  • RAKE keyword-extraction Python code, Python source.
  • A detailed walkthrough of TextRank keyword extraction implemented in Python; a useful reference for anyone interested.
  • Next, a Python project that, after some reworking by the author, is easy to learn and use and extracts article keywords well and fast. Environment: Python version 3.6.0, editor: PyCharm. Required packages: pip ...
  • A TextRank algorithm implemented in Python, for keyword extraction from text without corpus training.
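
    One of the results above suggests speeding up large-scale extraction with the multiprocessing module. A minimal sketch of that idea, assuming jieba's extract_tags as the per-text extractor and an invented list of documents:

    # coding=utf-8
    # Multiprocessing sketch; the documents and pool size are illustrative only.
    from multiprocessing import Pool
    import jieba.analyse

    def extract(text):
        return jieba.analyse.extract_tags(text, topK=5)

    if __name__ == '__main__':
        texts = ["第一篇文档的内容……", "第二篇文档的内容……"]  # placeholder documents
        with Pool(processes=4) as pool:
            keyword_lists = pool.map(extract, texts)
        for keywords in keyword_lists:
            print(" ".join(keywords))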
