"""Filter listed-company patent records by IPC code and export CSV subsets."""

from pathlib import Path
from typing import Any, Iterable

import pandas as pd
import polars as pl
import yaml

# Columns loaded from the patent CSV files (application number, IPC
# classification, application date, abstract).
columns = ["专利申请号", "IPC分类号", "专利申请日", "摘要"]

# Input: patent detail table.
patent_file_name = '~/Documents/论文/上市公司-专利明细数据.csv'
# Output of save_building_csv(): rows whose IPC code matches building_ipc.yml.
desc_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据.csv'
# Output of the script entry point: a random sample of desc_file_name.
demo_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本.csv'


def _expand(file_name: str) -> str:
    """Return *file_name* with a leading ``~`` expanded to the home directory."""
    return str(Path(file_name).expanduser())


def read_csv(chunk_size: int = 10) -> None:
    """Stream the patent CSV with pandas and print the size of each chunk.

    Args:
        chunk_size: Number of rows per chunk. Defaults to 10.
            NOTE(review): the original comment mentioned 100,000 rows per
            chunk while the code used 10 -- confirm the intended value.
    """
    reader = pd.read_csv(
        _expand(patent_file_name), chunksize=chunk_size, encoding="utf-8"
    )
    for chunk in reader:
        # Placeholder processing step: only report the chunk size.
        print("Processing a chunk of size:", len(chunk))


# read_csv()


def get_ipc_df(ipcs: Iterable[str]) -> pl.DataFrame:
    """Load the patent table and keep rows whose IPC code matches *ipcs*.

    Args:
        ipcs: IPC code fragments. They are joined with ``|`` and used as a
            regular expression, so each entry is interpreted as a regex.

    Returns:
        A DataFrame restricted to ``columns``, containing only the rows whose
        ``IPC分类号`` value matches at least one entry. When *ipcs* yields no
        pattern, the result has zero rows.
    """
    # Read only the columns that are needed from the patent file.
    df = pl.read_csv(_expand(patent_file_name), columns=columns, encoding="utf-8")

    # Build one alternation pattern out of all IPC entries.
    regex_pattern = "|".join(ipcs)
    if not regex_pattern:
        # An empty pattern matches every string and would return the whole
        # table instead of "nothing selected".
        return df.head(0)

    return df.filter(pl.col('IPC分类号').str.contains(regex_pattern))


def save_building_csv() -> None:
    """Filter patents by the IPC codes in ``building_ipc.yml`` and save them.

    The top-level keys of ``building_ipc.yml`` are used as the IPC patterns;
    the matching rows are written to ``desc_file_name``.

    Raises:
        ValueError: If the YAML file does not contain a non-empty mapping.
    """
    # Explicit encoding so the result does not depend on the platform locale
    # (consistent with read_ner_result).
    ipc_yaml = Path('building_ipc.yml').read_text(encoding='utf-8')
    ipc_data = yaml.safe_load(ipc_yaml)
    if not isinstance(ipc_data, dict) or not ipc_data:
        # Fail before touching the output file rather than overwrite it with
        # an unfiltered or empty table.
        raise ValueError(
            "building_ipc.yml must contain a non-empty mapping of IPC codes"
        )

    keywords_list = list(ipc_data)
    df = get_ipc_df(keywords_list)
    df.write_csv(_expand(desc_file_name))


def read_ner_result() -> Any:
    """Parse ``ds_patents_result.yml`` (UTF-8) and return the loaded object."""
    yaml_str = Path('ds_patents_result.yml').read_text(encoding='utf-8')
    return yaml.safe_load(yaml_str)


def save_demo_csv(fraction: float = 0.20) -> None:
    """Write a random sample of ``desc_file_name`` to ``demo_file_name``.

    Args:
        fraction: Share of rows to sample. Defaults to 0.20.
            NOTE(review): the original comment said 5% while the code used
            0.20 -- confirm the intended value.
    """
    df = pl.read_csv(_expand(desc_file_name), columns=columns, encoding="utf-8")
    sample = df.sample(fraction=fraction)
    sample.write_csv(_expand(demo_file_name))


if __name__ == '__main__':
    # save_building_csv()
    save_demo_csv()