"""Filter patent records by IPC code and export the matches plus a 5% sample.

Run as a script, this reads the IPC codes from ``building_ipc.yml`` (its
top-level keys), keeps the patent rows whose IPC column matches any of them,
writes the matches to ``desc_file_name`` and a random 5% sample of the matches
to ``demo_file_name``.
"""

from pathlib import Path
from typing import Iterable

import pandas as pd
import polars as pl
import yaml

# Columns loaded from the patent detail file
# (application number, IPC code, application date, abstract).
columns = ["专利申请号", "IPC分类号", "专利申请日", "摘要"]

# Input: patent detail data.
patent_file_name = '~/Documents/论文/上市公司-专利明细数据.csv'
# Output: rows matching the requested IPC codes.
desc_file_name = '~/Documents/论文/上市公司-专利摘要数据.csv'
# Output: 5% sample of the matching rows.
demo_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本.csv'


def read_csv(chunk_size: int = 10) -> None:
    """Read the patent detail CSV chunk by chunk and print each chunk's size.

    Args:
        chunk_size: Number of rows pandas loads per chunk. The default of 10
            is the value the script has always used; pass a larger value
            (e.g. ``100_000``) for real processing.
    """
    source = Path(patent_file_name).expanduser()
    for chunk in pd.read_csv(source, chunksize=chunk_size, encoding="utf-8"):
        # Placeholder for per-chunk processing.
        print("Processing a chunk of size:", len(chunk))


def get_ipc_df(ipcs: Iterable[str]) -> pl.DataFrame:
    """Load the patent rows whose IPC column matches any of the given codes.

    Args:
        ipcs: IPC codes to look for. They are joined with ``|`` and used as a
            regular expression, so each entry is treated as a regex fragment
            (it is not escaped).
            NOTE(review): an empty iterable produces an empty pattern, which
            matches every string - callers should pass at least one code.

    Returns:
        A polars DataFrame holding only the columns listed in ``columns`` and
        only the rows whose ``IPC分类号`` value contains a match.
    """
    source = str(Path(patent_file_name).expanduser())
    # "utf8" is polars' native encoding name; other spellings such as "utf-8"
    # make polars decode the whole file in Python memory before parsing.
    patents = pl.read_csv(source, columns=columns, encoding="utf8")

    # Build one alternation pattern covering all requested IPC codes.
    regex_pattern = "|".join(ipcs)

    return patents.filter(pl.col('IPC分类号').str.contains(regex_pattern))


def main() -> None:
    """Export the patents matching the IPC codes in ``building_ipc.yml``.

    Raises:
        ValueError: If the YAML file does not contain a non-empty mapping.
    """
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    ipc_yaml = Path('building_ipc.yml').read_text(encoding="utf-8")
    ipc_data = yaml.safe_load(ipc_yaml)
    if not isinstance(ipc_data, dict) or not ipc_data:
        raise ValueError(
            "building_ipc.yml must contain a non-empty mapping keyed by IPC code"
        )

    # The mapping's keys are the IPC codes to filter on.
    keywords_list = list(ipc_data)
    matches = get_ipc_df(keywords_list)
    matches.write_csv(Path(desc_file_name).expanduser())

    # Draw a 5% random sample of the matches and save it separately.
    sample = matches.sample(fraction=0.05)
    sample.write_csv(Path(demo_file_name).expanduser())


if __name__ == '__main__':
    main()