# lda_flow.py (4.2 KB)
  1. import prefect
  2. from prefect import flow, task
  3. import time
  4. import random
  5. import data_preparation
  6. import bio
  7. import ner
  8. from env import env
  # Task definitions
  @task
  def filter_patent(ipcs: list, start=None, end=None):
      """
      Filter patent records out of the file 上市公司-专利明细数据.csv.

      Thin wrapper: the three arguments are forwarded positionally and
      unchanged to ``data_preparation.save_building_csv``.

      :param ipcs: list of IPC classifications to filter on
      :param start: start date, e.g. '2014-01-01'
      :param end: end date
      :return: None
      """
      data_preparation.save_building_csv(ipcs, start, end)
  def sample_patent(fraction=0.05):
      """
      Randomly sample from the filtered patent data; per the original note the
      result is stored in the file 上市公司-专利摘要数据-筛选.csv.

      :param fraction: sampling rate
      :return: None
      """
      # NOTE(review): this is the same helper that filter_patent calls with
      # (ipcs, start, end); here `fraction` lands in the first positional slot.
      # Looks like a copy-paste slip -- confirm which data_preparation helper
      # is meant to do the sampling.
      data_preparation.save_building_csv(fraction)
  def deepseek_ner_sample():
      """
      Hand the sampled patent data to DeepSeek for named-entity recognition;
      per the original note the result is stored in ds_sample_ner_result.yml.

      Delegates entirely to ``bio.ner_sample_by_deepseek``.

      :return: None
      """
      bio.ner_sample_by_deepseek()
  @task
  def cut_words():
      """Placeholder task with no implementation yet (presumably word segmentation -- TODO confirm)."""
  def add_bio():
      """
      Apply BIO tagging based on the named entities recognised in
      ds_sample_ner_result.yml; per the original note the result is stored in
      the file 上市公司-专利摘要数据-筛选-样本-BIO.csv.

      Delegates entirely to ``bio.add_bio``.

      :return: None
      """
      bio.add_bio()
  def train(model_name):
      """
      Train the model by delegating to ``ner.train``.

      :param model_name: model identifier, passed through unchanged
      :return: None
      """
      ner.train(model_name)
  def model_test():
      """Placeholder with no implementation yet (presumably model evaluation -- TODO confirm); returns None."""
  def _ner():
      """Placeholder with no implementation yet (presumably the NER step -- TODO confirm); returns None."""
  def lda():
      """Placeholder with no implementation yet (presumably the LDA step -- TODO confirm); returns None."""
  if __name__ == '__main__':
      # Input data and generated output share one project directory.
      project_dir = '/Users/gaojun/dev/gxy-gd/论文'
      env.data_folder = project_dir
      env.output_folder = project_dir

      model_name = 'hfl/chinese-roberta-wwm-ext-large-building-building'
      # Alternative entry point kept from the original (disabled):
      # ner.predict_test(model_name)
      ner.train(model_name)
  59. #
  60. # # 定义参数化的 Flow
  61. # @flow(name="Multi-Parameter Experiment Flow", log_prints=True) # log_prints=True 会自动捕获 print 语句到 Prefect 日志
  62. # def run_experiment(
  63. # data_source: str = "default_source", # 参数1: 数据源
  64. # process_level: float = 0.5, # 参数2: 处理级别
  65. # report_type: str = "txt" # 参数3: 报告格式
  66. # ):
  67. # """
  68. # 这是一个接收多个参数的实验性数据流。
  69. # """
  70. # run_id = prefect.runtime.flow_run.id # 获取当前 Flow Run 的 ID
  71. # print(f"开始 Flow Run: {run_id}")
  72. # print(f"参数: data_source='{data_source}', process_level={process_level}, report_type='{report_type}'")
  73. #
  74. # # 调用 Tasks
  75. # raw_data = load_data(source=data_source)
  76. # processing_result = process_data(data=raw_data, processing_param=process_level)
  77. # final_report = generate_report(result=processing_result, report_format=report_type)
  78. #
  79. # print(f"Flow Run {run_id} 完成.")
  80. # return final_report # Flow 可以返回值,这也会被记录
  81. #
  82. # if __name__ == "__main__":
  83. # # 定义你想要试验的参数组合
  84. # parameter_combinations = [
  85. # {"data_source": "source_A", "process_level": 0.3, "report_type": "json"},
  86. # {"data_source": "source_A", "process_level": 0.8, "report_type": "csv"},
  87. # {"data_source": "source_B", "process_level": 0.5, "report_type": "txt"},
  88. # {"data_source": "source_C", "process_level": 0.9, "report_type": "json"},
  89. # {"data_source": "source_C", "process_level": 0.2, "report_type": "txt"},
  90. # ]
  91. #
  92. # print(f"准备执行 {len(parameter_combinations)} 组实验...")
  93. #
  94. # # 循环遍历参数组合,为每一组参数执行一次 Flow
  95. # results = []
  96. # for i, params in enumerate(parameter_combinations):
  97. # print(f"\n--- 开始第 {i+1} 次实验: {params} ---")
  98. # # 直接调用 flow 函数,并传入参数
  99. # # Prefect 会自动捕获这次调用为一个 Flow Run
  100. # try:
  101. # # 使用 **params 将字典解包为关键字参数传递给 flow 函数
  102. # result = run_experiment(**params)
  103. # results.append({"params": params, "result": result, "status": "Success"})
  104. # print(f"--- 第 {i+1} 次实验完成 ---")
  105. # except Exception as e:
  106. # results.append({"params": params, "result": str(e), "status": "Failed"})
  107. # print(f"--- 第 {i+1} 次实验失败: {e} ---")
  108. #
  109. # print("\n所有本地实验执行完毕。")
  110. # # 你可以在这里对 results 列表进行分析
  111. # # print("实验结果汇总:", results)