# data_preparation.py (3.2 KB)
from pathlib import Path

import pandas as pd
import polars as pl
import yaml

from env import env
  6. columns = ["专利申请号", "IPC分类号", "专利申请日", "摘要"]
  7. patent_file_name = '上市公司-专利明细数据.csv'
  8. filerted_file_name = '上市公司-专利摘要数据-筛选.csv'
  9. sample_file_name = '上市公司-专利摘要数据-筛选-样本.csv'
  10. def read_csv():
  11. # 逐块读取(每次读取 10 行)
  12. chunk_size = 10
  13. for chunk in pd.read_csv(patent_file_name, chunksize=chunk_size, encoding="utf-8"):
  14. # 处理当前块数据
  15. print("Processing a chunk of size:", len(chunk))
  16. # read_csv()
  17. def get_stop_words():
  18. with open(str(env.resolve('conf/hit_stopwords.txt'))) as f:
  19. stopwords = set(line.strip() for line in f)
  20. with open(str(env.resolve('conf/patent_stopwords.txt'))) as f:
  21. stopwords |= set(line.strip() for line in f)
  22. return stopwords
  23. stopwords = get_stop_words()
  24. def clean_raw_text(raw_text):
  25. raw = raw_text.replace(r'\r', '').replace(r'\n', '').replace(r'\t', '').replace(' ', '')
  26. # for stop in stopwords:
  27. # raw = raw.replace(stop, '')
  28. return raw
  29. def get_ipc_df(ipcs, start=None, end=None):
  30. # 根据给定的ipc列表读取专利数据
  31. # 读取专利文件指定的列
  32. df = pl.read_csv(str(env.resolve_data(patent_file_name)), columns=columns, encoding="utf-8")
  33. # 构造IPC正则表达式
  34. regex_pattern = "|".join(ipcs)
  35. # 按IPC正则表达式筛选数据
  36. df = df.filter(pl.col('IPC分类号').str.contains(regex_pattern))
  37. # 按申请日期过滤
  38. if start is not None:
  39. df = df.filter(pl.col('专利申请日') >= start)
  40. if end is not None:
  41. df = df.filter(pl.col('专利申请日') <= end)
  42. return df
  43. def save_building_csv(ipcs=None, start=None, end=None):
  44. # 筛选并保存跟建筑工业化相关IPC的专利数据
  45. if ipcs is None:
  46. ipc_yaml = env.resolve('conf/building_ipc.yml').read_text()
  47. ipc_data = yaml.safe_load(ipc_yaml)
  48. ipcs = list(ipc_data.keys())
  49. df = get_ipc_df(ipcs, start, end)
  50. df.write_csv(str(env.resolve_output(filerted_file_name)))
  51. def clean_pl_df(df, dirty_column):
  52. cleaned_df = df.with_columns(
  53. pl.col(dirty_column)
  54. # .str.to_lowercase() # 转为小写
  55. # .str.replace_all(r"[^a-z]", "") # 删除非字母字符
  56. .replace('关注公众号“马 克 数 据 网”', '')
  57. .str.strip_chars() # 去除首尾空格
  58. .alias("fully_cleaned")
  59. )
  60. return cleaned_df
  61. def save_patent_description():
  62. df = pl.read_csv(str(env.resolve_data(patent_file_name)), columns=["摘要"], encoding="utf-8")
  63. df.write_csv(str(env.resolve_data('上市公司-专利明细数据-摘要.csv')))
  64. def save_sample_csv(fraction=0.05):
  65. df = pl.read_csv(str(env.resolve_output(filerted_file_name)), columns=columns, encoding="utf-8")
  66. # 抽取样本并保存
  67. df = df.sample(fraction=fraction)
  68. df.write_csv(str(env.resolve_output(sample_file_name)))
  69. if __name__ == '__main__':
  70. env.data_folder = '/Users/gaojun/dev/gxy-gd/论文'
  71. env.output_folder = '/Users/gaojun/dev/gxy-gd/论文'
  72. save_patent_description()
  73. # save_building_csv()
  74. # save_sample_csv()