data_preparation.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import pandas as pd
  2. import polars as pl
  3. import yaml
  4. from pathlib import Path
  5. columns = ["专利申请号", "IPC分类号", "专利申请日", "摘要"]
  6. patent_file_name = '~/Documents/论文/上市公司-专利明细数据.csv'
  7. desc_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据.csv'
  8. demo_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本.csv'
  9. def read_csv():
  10. # 逐块读取(每次读取 100,000 行)
  11. chunk_size = 10
  12. for chunk in pd.read_csv(patent_file_name, chunksize=chunk_size, encoding="utf-8"):
  13. # 处理当前块数据
  14. print("Processing a chunk of size:", len(chunk))
  15. # read_csv()
  16. def get_ipc_df(ipcs):
  17. # 根据给定的ipc列表读取专利数据
  18. # 读取专利文件指定的列
  19. df = pl.read_csv(str(Path(patent_file_name).expanduser()), columns=columns, encoding="utf-8")
  20. # 构造IPC正则表达式
  21. regex_pattern = "|".join(ipcs)
  22. # 按IPC正则表达式筛选数据
  23. df = df.filter(pl.col('IPC分类号').str.contains(regex_pattern))
  24. return df
  25. def save_building_csv():
  26. # 筛选并保存跟建筑工业化相关IPC的专利数据
  27. ipc_yaml = Path('building_ipc.yml').read_text()
  28. ipc_data = yaml.safe_load(ipc_yaml)
  29. keywords_list = list(ipc_data.keys())
  30. df = get_ipc_df(keywords_list)
  31. df.write_csv(desc_file_name)
  32. def read_ner_result():
  33. yaml_str = Path('ds_patents_result.yml').read_text(encoding='utf-8')
  34. ner_result = yaml.safe_load(yaml_str)
  35. return ner_result
  36. if __name__ == '__main__':
  37. # save_building_csv()
  38. df = pl.read_csv(str(Path(desc_file_name).expanduser()), columns=columns, encoding="utf-8")
  39. # 抽取5%样本并保存
  40. df = df.sample(fraction=0.20)
  41. df.write_csv(demo_file_name)