# data_preparation.py
  1. import pandas as pd
  2. import polars as pl
  3. import yaml
  4. from pathlib import Path
  # Columns loaded from the patent detail CSV by get_ipc_df(): application
  # number, IPC classification code, application date, and abstract.
  columns = [
      "专利申请号",
      "IPC分类号",
      "专利申请日",
      "摘要",
  ]

  # Input: patent detail CSV. The path starts with "~", so it still needs
  # user-directory expansion before being opened.
  patent_file_name = "~/Documents/论文/上市公司-专利明细数据.csv"

  # Output: the IPC-filtered patent rows written by the script entry point.
  desc_file_name = "~/Documents/论文/上市公司-专利摘要数据.csv"

  # Output: the 5% random sample of the filtered rows.
  demo_file_name = "~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本.csv"
  def read_csv():
      """Stream the patent detail CSV in chunks and print each chunk's row count.

      The file named by ``patent_file_name`` is read with pandas in chunks
      of ``rows_per_chunk`` rows, so the whole file is never held in memory
      at once. Nothing is returned; the only effect is one printed line per
      chunk.
      """
      # NOTE(review): 10 rows per chunk looks like a debugging value;
      # confirm the intended size before running on the full file.
      rows_per_chunk = 10
      reader = pd.read_csv(
          patent_file_name,
          chunksize=rows_per_chunk,
          encoding="utf-8",
      )
      for part in reader:
          # Placeholder for per-chunk processing: only the size is reported.
          print("Processing a chunk of size:", len(part))
  15. # read_csv()
  def get_ipc_df(ipcs):
      """Load the patent rows whose IPC classification matches any given pattern.

      Args:
          ipcs: Iterable of strings. They are joined with ``|`` into a single
              regular expression, so each entry is interpreted as a regex
              fragment, not as a literal.

      Returns:
          A polars DataFrame restricted to the module-level ``columns`` and
          to the rows whose ``IPC分类号`` value contains a match for at
          least one entry of ``ipcs``.
      """
      # Expand "~" explicitly rather than relying on the reader to do it.
      source_path = str(Path(patent_file_name).expanduser())

      # Use polars' native "utf8" spelling. Per the polars documentation any
      # other encoding name (including "utf-8") makes polars decode the whole
      # file in Python memory first, which is wasteful for a large CSV.
      df = pl.read_csv(source_path, columns=columns, encoding="utf8")

      # One alternation pattern covering every requested IPC entry.
      regex_pattern = "|".join(ipcs)

      # Keep only the rows whose IPC classification matches the pattern.
      return df.filter(pl.col("IPC分类号").str.contains(regex_pattern))
  if __name__ == '__main__':
      # Filter the patent data down to the IPC codes listed in
      # building_ipc.yml and save the result.
      # Read the YAML as UTF-8 explicitly so the result does not depend on
      # the platform's default locale encoding.
      ipc_yaml = Path('building_ipc.yml').read_text(encoding="utf-8")
      ipc_data = yaml.safe_load(ipc_yaml)

      # safe_load returns None for an empty file and may return a non-mapping
      # for other content. An empty key list would also produce an empty
      # regex, which presumably matches every row — fail with a clear message.
      if not isinstance(ipc_data, dict) or not ipc_data:
          raise ValueError(
              "building_ipc.yml must contain a non-empty mapping keyed by IPC code"
          )

      keywords_list = list(ipc_data)
      df = get_ipc_df(keywords_list)

      # Expand "~" explicitly, matching how get_ipc_df opens the input file.
      df.write_csv(str(Path(desc_file_name).expanduser()))

      # Draw a 5% random sample of the filtered rows and save it.
      # The sample is unseeded, so it differs from run to run.
      df = df.sample(fraction=0.05)
      df.write_csv(str(Path(demo_file_name).expanduser()))