| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- import pandas as pd
- import polars as pl
- import yaml
- from pathlib import Path
- from env import env
# Columns loaded from the patent detail CSV: application number, IPC
# classification, application date and abstract.
columns = [
    "专利申请号",
    "IPC分类号",
    "专利申请日",
    "摘要",
]

# Source CSV with the full patent details.
patent_file_name = "上市公司-专利明细数据.csv"
# CSV written after filtering by IPC / date.
# NOTE: the "filerted" spelling is kept because other code refers to this name.
filerted_file_name = "上市公司-专利摘要数据-筛选.csv"
# CSV holding a random sample of the filtered data.
sample_file_name = "上市公司-专利摘要数据-筛选-样本.csv"
def read_csv(path=None, chunk_size=10):
    """Read a CSV file chunk by chunk and print the size of every chunk.

    Args:
        path: CSV file to read. Defaults to ``patent_file_name`` (resolved
            relative to the current working directory, as before).
        chunk_size: Number of rows per chunk. Defaults to 10.

    Returns:
        None. The size of each chunk is printed to stdout.
    """
    if path is None:
        path = patent_file_name
    # The chunked reader is a context manager; using it guarantees the file
    # handle is closed even if processing a chunk raises.
    with pd.read_csv(path, chunksize=chunk_size, encoding="utf-8") as reader:
        for chunk in reader:
            # Placeholder for per-chunk processing.
            print("Processing a chunk of size:", len(chunk))
- # read_csv()
def get_stop_words():
    """Load the stop-word lists under ``conf/`` into a single set.

    Reads ``conf/hit_stopwords.txt`` and ``conf/patent_stopwords.txt``
    (resolved through ``env.resolve``). Every line, stripped of surrounding
    whitespace, becomes one entry; a blank line therefore contributes the
    empty string.

    Returns:
        set[str]: The union of the entries of both files.
    """
    stop_word_files = ('conf/hit_stopwords.txt', 'conf/patent_stopwords.txt')
    words = set()
    for relative_path in stop_word_files:
        # Decode explicitly as UTF-8: the platform default encoding is not
        # UTF-8 everywhere and would garble or reject the Chinese entries.
        with open(str(env.resolve(relative_path)), encoding="utf-8") as f:
            words.update(line.strip() for line in f)
    return words


# Stop words are loaded once, when the module is imported.
stopwords = get_stop_words()
def clean_raw_text(raw_text):
    """Strip line-break, tab and space noise from a raw text string.

    Removes the literal two-character escape sequences (a backslash followed
    by ``r``, ``n`` or ``t``), which is what the original implementation
    matched, and additionally removes real carriage-return, line-feed and tab
    characters, which the original raw-string patterns never matched. Spaces
    are removed as well.

    Stop-word removal is intentionally not applied here.

    Args:
        raw_text: The string to clean.

    Returns:
        str: ``raw_text`` without the characters and sequences listed above.
    """
    cleaned = raw_text
    # Literal backslash sequences first, in the same order as before, so the
    # result for inputs without real control characters is unchanged.
    for literal_escape in (r'\r', r'\n', r'\t'):
        cleaned = cleaned.replace(literal_escape, '')
    # One pass over the string drops real CR / LF / TAB characters and spaces.
    return cleaned.translate(str.maketrans('', '', '\r\n\t '))
def get_ipc_df(ipcs, start=None, end=None):
    """Load the patent rows whose IPC classification matches any given IPC.

    Only the columns listed in ``columns`` are read from the patent CSV.

    Args:
        ipcs: Iterable of strings; they are joined with ``|`` into a single
            regular expression matched against the ``IPC分类号`` column.
        start: Optional lower bound (inclusive) for ``专利申请日``.
        end: Optional upper bound (inclusive) for ``专利申请日``.

    Returns:
        The filtered polars DataFrame.
    """
    # NOTE(review): polars documents "utf8" / "utf8-lossy" as its native
    # encodings; confirm whether "utf-8" takes a slower decode path.
    source_path = str(env.resolve_data(patent_file_name))
    frame = pl.read_csv(source_path, columns=columns, encoding="utf-8")

    ipc_column = pl.col('IPC分类号')
    application_date = pl.col('专利申请日')

    # Keep rows whose IPC field matches at least one of the requested IPCs.
    matched = frame.filter(ipc_column.str.contains("|".join(ipcs)))

    # Apply the optional application-date bounds.
    if start is not None:
        matched = matched.filter(application_date >= start)
    if end is not None:
        matched = matched.filter(application_date <= end)
    return matched
def save_building_csv(ipcs=None, start=None, end=None):
    """Filter the patent data by IPC and save the result as a CSV file.

    Args:
        ipcs: IPC strings to keep. When ``None``, the top-level keys of
            ``conf/building_ipc.yml`` are used.
        start: Optional lower bound (inclusive) for the application date.
        end: Optional upper bound (inclusive) for the application date.

    Returns:
        None. The filtered data is written to ``filerted_file_name`` in the
        output location resolved by ``env.resolve_output``.
    """
    if ipcs is None:
        # Decode the config explicitly as UTF-8 instead of relying on the
        # platform default encoding.
        config_path = str(env.resolve('conf/building_ipc.yml'))
        with open(config_path, encoding="utf-8") as f:
            ipc_data = yaml.safe_load(f)
        ipcs = list(ipc_data.keys())
    df = get_ipc_df(ipcs, start, end)
    df.write_csv(str(env.resolve_output(filerted_file_name)))
def clean_pl_df(df, dirty_column):
    """Add a ``fully_cleaned`` column with the watermark text removed.

    Every occurrence of the watermark string is removed from
    ``dirty_column`` and the result is stripped of leading and trailing
    whitespace. The input column itself is left unchanged.

    Args:
        df: polars DataFrame to clean.
        dirty_column: Name of the string column to clean.

    Returns:
        A new DataFrame with the additional ``fully_cleaned`` column.
    """
    watermark = '关注公众号“马 克 数 据 网”'
    return df.with_columns(
        pl.col(dirty_column)
        # Substring removal: ``Expr.replace`` only maps whole values, so it
        # never touched a watermark embedded inside a longer text.
        # ``literal=True`` keeps the watermark from being parsed as a regex.
        .str.replace_all(watermark, '', literal=True)
        .str.strip_chars()
        .alias("fully_cleaned")
    )
def save_patent_description():
    """Extract the abstract column of the patent CSV into its own CSV file.

    Reads only the ``摘要`` column from ``patent_file_name`` and writes it to
    ``上市公司-专利明细数据-摘要.csv``; both paths are resolved through
    ``env.resolve_data``.
    """
    source_path = str(env.resolve_data(patent_file_name))
    target_path = str(env.resolve_data('上市公司-专利明细数据-摘要.csv'))
    abstracts = pl.read_csv(source_path, columns=["摘要"], encoding="utf-8")
    abstracts.write_csv(target_path)
def save_sample_csv(fraction=0.05, seed=None):
    """Draw a random sample of the filtered patent data and save it.

    Args:
        fraction: Fraction of rows to sample. Defaults to 0.05.
        seed: Optional seed for the random sampling. Pass a fixed integer to
            obtain the same sample on every run; ``None`` (the default)
            keeps the previous behaviour of a fresh random sample each time.

    Returns:
        None. The sample is read from ``filerted_file_name`` and written to
        ``sample_file_name``, both resolved through ``env.resolve_output``.
    """
    df = pl.read_csv(
        str(env.resolve_output(filerted_file_name)),
        columns=columns,
        encoding="utf-8",
    )
    sampled = df.sample(fraction=fraction, seed=seed)
    sampled.write_csv(str(env.resolve_output(sample_file_name)))
if __name__ == '__main__':
    # Input data and generated output share the same folder.
    work_dir = '/Users/gaojun/dev/gxy-gd/论文'
    env.data_folder = work_dir
    env.output_folder = work_dir

    # Currently enabled step: export the abstract column only.
    save_patent_description()
    # Other steps, disabled for now:
    # save_building_csv()
    # save_sample_csv()
|