adwardlee ShiwenNi committed on
Commit
4c29b8a
·
0 Parent(s):

Duplicate from ShiwenNi/ChatReviewer

Browse files

Co-authored-by: ShiwenNi <[email protected]>

Files changed (5) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +201 -0
  4. get_paper_from_pdf.py +193 -0
  5. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ChatReviewer
3
+ emoji: 💩
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.22.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: ShiwenNi/ChatReviewer
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ import datetime
5
+ import time
6
+ import openai, tenacity
7
+ import argparse
8
+ import configparser
9
+ import json
10
+ import tiktoken
11
+ from get_paper_from_pdf import Paper
12
+ import gradio
13
+
14
# Reviewer: wraps the OpenAI chat API to produce an automatic paper review.
class Reviewer:
    """Generate a structured review for a paper via gpt-3.5-turbo.

    Args:
        api: OpenAI API key (``sk-...`` string).
        review_format: review template injected into the system prompt.
        paper_pdf: accepted for interface compatibility; not used here.
        language: output language of the review ("English" / "Chinese").
    """

    def __init__(self, api, review_format, paper_pdf, language):
        self.api = api
        self.review_format = review_format
        self.language = language
        # Context budget for a single gpt-3.5-turbo request.
        self.max_token_num = 4096
        # NOTE(review): "gpt2" is not gpt-3.5-turbo's tokenizer (cl100k_base),
        # so token counts computed below are only approximations — confirm intent.
        self.encoding = tiktoken.get_encoding("gpt2")

    def review_by_chatgpt(self, paper_list):
        """Review papers and return ``(review_text, total_tokens_used)``.

        NOTE(review): the ``return`` inside the loop means only the FIRST
        paper in ``paper_list`` is ever reviewed; kept unchanged because the
        UI always passes a single-element list.
        """
        for paper_index, paper in enumerate(paper_list):
            # Stage 1: let the model choose up to two extra sections to read.
            sections_of_interest = self.stage_1(paper)
            # Assemble the essential parts of the paper.
            text = ''
            try:
                text += 'Title:' + paper.title + '. '
                text += 'Abstract: ' + paper.section_texts['Abstract']
            except Exception:  # was a bare except; title/Abstract may be absent
                pass
            intro_title = next((item for item in paper.section_names if 'ntroduction' in item.lower()), None)
            if intro_title is not None:
                text += 'Introduction: ' + paper.section_texts[intro_title]
            # Similar for conclusion section
            conclusion_title = next((item for item in paper.section_names if 'onclusion' in item), None)
            if conclusion_title is not None:
                text += 'Conclusion: ' + paper.section_texts[conclusion_title]
            # Append the sections the model asked for in stage 1.
            for heading in sections_of_interest:
                if heading in paper.section_names:
                    text += heading + ': ' + paper.section_texts[heading]
            chat_review_text, total_token_used = self.chat_review(text=text)
            return chat_review_text, total_token_used

    def stage_1(self, paper):
        """Ask the model which (at most two) sections it wants in full.

        Returns the model's reply split on ',' — a list of section-name
        strings (NOTE(review): entries may carry leading spaces).
        """
        text = ''
        paper_Abstract = 'Abstract'  # fallback when the paper has no Abstract
        try:
            text += 'Title:' + paper.title + '. '
            paper_Abstract = paper.section_texts['Abstract']
        except Exception:  # was a bare except; title/Abstract may be absent
            pass
        text += 'Abstract: ' + paper_Abstract
        openai.api_key = self.api
        messages = [
            {"role": "system",
             "content": f"You are a professional reviewer. "
                        f"I will give you a paper. You need to review this paper and discuss the novelty and originality of ideas, correctness, clarity, the significance of results, potential impact and quality of the presentation. "
                        f"Due to the length limitations, I am only allowed to provide you the abstract, introduction, conclusion and at most two sections of this paper."
                        f"Now I will give you the title and abstract and the headings of potential sections. "
                        f"You need to reply at most two headings. Then I will further provide you the full information, includes aforementioned sections and at most two sections you called for.\n\n"
                        f"Title: {paper.title}\n\n"
                        f"Abstract: {paper_Abstract}\n\n"
                        f"Potential Sections: {paper.section_names[2:-1]}\n\n"
                        f"Follow the following format to output your choice of sections:"
                        f"{{chosen section 1}}, {{chosen section 2}}\n\n"},
            {"role": "user", "content": text},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        result = ''
        for choice in response.choices:
            result += choice.message.content
        return result.split(',')

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_review(self, text):
        """Send the assembled paper text to the model for the actual review.

        Retries with exponential backoff on API errors (tenacity).
        Returns ``(review_text, total_tokens_used)``.
        """
        openai.api_key = self.api  # set the API key for this request
        review_prompt_token = 1000  # tokens reserved for prompt + response
        # Approximate token length; guard against empty text (ZeroDivisionError).
        text_token = max(len(self.encoding.encode(text)), 1)
        # Truncate the paper so prompt + response fit inside the context window.
        input_text_index = int(len(text)*(self.max_token_num-review_prompt_token)/text_token)
        input_text = "This is the paper for your review:" + text[:input_text_index]
        messages=[
            {"role": "system", "content": "You are a professional reviewer. Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:"+ self.review_format +" Must be output in {}.".format(self.language)},
            {"role": "user", "content": input_text},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        result = ''
        for choice in response.choices:
            result += choice.message.content
        print("********"*10)
        print(result)
        print("********"*10)
        print("prompt_token_used:", response.usage.prompt_tokens)
        print("completion_token_used:", response.usage.completion_tokens)
        print("total_token_used:", response.usage.total_tokens)
        print("response_time:", response.response_ms/1000.0, 's')
        return result, response.usage.total_tokens
117
+
118
+
119
def main(api, review_format, paper_pdf, language):
    """Gradio callback: parse the uploaded PDF and produce a review.

    Args map 1:1 to the four input widgets; returns two strings matching
    the interface's two output textboxes (review text, usage statistics).
    """
    start_time = time.time()
    if not api or not review_format or not paper_pdf:
        # BUGFIX: the interface declares two outputs, but this branch used to
        # return a single string, which crashed Gradio on incomplete input.
        return "请输入完整内容!", ""
    # 判断PDF文件 (process the uploaded PDF file)
    else:
        paper_list = [Paper(path=paper_pdf)]
        # Build the Reviewer that talks to the OpenAI API.
        reviewer1 = Reviewer(api, review_format, paper_pdf, language)
        comments, total_token_used = reviewer1.review_by_chatgpt(paper_list=paper_list)
        time_used = time.time() - start_time
        output2 ="使用token数:"+ str(total_token_used)+"\n花费时间:"+ str(round(time_used, 2)) +"秒"
        return comments, output2
133
+
134
+
135
+
136
########################################################################################################
# Page title shown at the top of the Gradio app.
title = "🤖ChatReviewer🤖"
# Description (HTML/markdown) rendered under the title.

description = '''<div align='left'>
<img align='right' src='http://i.imgtg.com/2023/03/22/94PLN.png' width="270">

<strong>ChatReviewer是一款基于ChatGPT-3.5的API开发的论文自动评审AI助手。</strong>其用途如下:

⭐️对论文进行快速总结和评审,提高科研人员的文献阅读和理解的效率,紧跟研究前沿。

⭐️对自己的论文进行评审,根据ChatReviewer生成的审稿意见进行查漏补缺,进一步提高自己的论文质量。

⭐️辅助论文审稿,给出参考意见,提高审稿效率和质量。(🈲:禁止直接复制生成的评论用于任何论文审稿工作!)

如果觉得很卡,可以点击右上角的Duplicate this Space,把ChatReviewer复制到你自己的Space中!

本项目的[Github](https://github.com/nishiwen1214/ChatReviewer),欢迎Star和Fork,也欢迎大佬赞助让本项目快速成长!💗([获取Api Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/))
</div>
'''

# Build the four Gradio input widgets (legacy gradio 3.x "inputs" API):
# API key (masked), review template (pre-filled default), PDF upload, language.
inp = [gradio.inputs.Textbox(label="请输入你的API-key(sk开头的字符串)",
                             default="",
                             type='password'),
       gradio.inputs.Textbox(lines=5,
                             label="请输入特定的评审要求和格式(否则为默认格式)",
                             default="""* Overall Review
Please briefly summarize the main points and contributions of this paper.
xxx
* Paper Strength
Please provide a list of the strengths of this paper, including but not limited to: innovative and practical methodology, insightful empirical findings or in-depth theoretical analysis,
well-structured review of relevant literature, and any other factors that may make the paper valuable to readers. (Maximum length: 2,000 characters)
(1) xxx
(2) xxx
(3) xxx
* Paper Weakness
Please provide a numbered list of your main concerns regarding this paper (so authors could respond to the concerns individually).
These may include, but are not limited to: inadequate implementation details for reproducing the study, limited evaluation and ablation studies for the proposed method,
correctness of the theoretical analysis or experimental results, lack of comparisons or discussions with widely-known baselines in the field, lack of clarity in exposition,
or any other factors that may impede the reader's understanding or benefit from the paper. Please kindly refrain from providing a general assessment of the paper's novelty without providing detailed explanations. (Maximum length: 2,000 characters)
(1) xxx
(2) xxx
(3) xxx
* Questions To Authors And Suggestions For Rebuttal
Please provide a numbered list of specific and clear questions that pertain to the details of the proposed method, evaluation setting, or additional results that would aid in supporting the authors' claims.
The questions should be formulated in a manner that, after the authors have answered them during the rebuttal, it would enable a more thorough assessment of the paper's quality. (Maximum length: 2,000 characters)
*Overall score (1-10)
The paper is scored on a scale of 1-10, with 10 being the full mark, and 6 stands for borderline accept. Then give the reason for your rating.
xxx"""
                             ),
       gradio.inputs.File(label="请上传论文PDF(必填)"),
       gradio.inputs.Radio(choices=["English", "Chinese"],
                           default="English",
                           label="选择输出语言"),
       ]

# Assemble the interface: main() maps the four inputs above to two text
# outputs (the review itself and the token/time statistics).
chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs = [gradio.Textbox(lines=25, label="评审结果"), gradio.Textbox(lines=2, label="资源统计")],
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui .launch(quiet=True, show_api=False)
get_paper_from_pdf.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz, io, os
2
+ from PIL import Image
3
+ from collections import Counter
4
+ import json
5
+ import re
6
+
7
class Paper:
    """In-memory representation of a PDF paper, parsed with PyMuPDF (fitz).

    When constructed without an explicit ``title``, the PDF is opened, the
    title is detected from font sizes, and section headings/texts are
    extracted into ``section_names`` / ``section_texts``.
    """

    def __init__(self, path, title='', url='', abs='', authors=None):
        """Initialize a Paper from a PDF path.

        Args:
            path: filesystem path of the PDF.
            title: known title; when empty, the title is detected from the PDF
                and the document is parsed immediately.
            url: article link.
            abs: abstract text.
            authors: author list. BUGFIX: was a mutable default argument
                (``authors=[]``) shared across all instances; now a fresh
                list is created per instance.
        """
        self.url = url                # article link
        self.path = path              # PDF path
        self.section_names = []       # section headings
        self.section_texts = {}       # heading -> section body text
        self.abs = abs
        self.title_page = 0           # page index where the title was found
        if title == '':
            self.pdf = fitz.open(self.path)  # open the PDF document
            self.title = self.get_title()
            self.parse_pdf()
        else:
            self.title = title
        self.authors = [] if authors is None else authors
        # Numbering tokens used to recognize chapter headings.
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''

    def parse_pdf(self):
        """Re-open the PDF, extract full text and per-section texts."""
        self.pdf = fitz.open(self.path)
        self.text_list = [page.get_text() for page in self.pdf]
        self.all_text = ' '.join(self.text_list)
        self.extract_section_infomation()
        self.section_texts.update({"title": self.title})
        self.pdf.close()

    # Heuristically identify chapter names (e.g. "1. Introduction", "II. Method").
    def get_chapter_names(self, ):
        """Return lines that look like numbered chapter headings."""
        doc = fitz.open(self.path)
        text_list = [page.get_text() for page in doc]
        all_text = ''
        for text in text_list:
            all_text += text
        chapter_names = []
        for line in all_text.split('\n'):
            if '.' in line:
                point_split_list = line.split('.')
                space_split_list = line.split(' ')
                # Headings are short (2-4 words) and start with a roman or
                # arabic numeral before the first '.'.
                if 1 < len(space_split_list) < 5:
                    if 1 < len(point_split_list) < 5 and (
                            point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
                        chapter_names.append(line)

        return chapter_names

    def get_title(self):
        """Detect the paper title: text set in (one of) the two largest fonts."""
        doc = self.pdf
        max_font_size = 0    # largest font size seen so far
        max_string = ""      # text carrying that largest size
        max_font_sizes = [0]
        # Pass 1: record the font size of the first span of every text block.
        for page_index, page in enumerate(doc):
            text = page.get_text("dict")
            blocks = text["blocks"]
            for block in blocks:
                if block["type"] == 0 and len(block['lines']):  # text-type block
                    if len(block["lines"][0]["spans"]):
                        font_size = block["lines"][0]["spans"][0]["size"]
                        max_font_sizes.append(font_size)
                        if font_size > max_font_size:
                            max_font_size = font_size
                            max_string = block["lines"][0]["spans"][0]["text"]
        max_font_sizes.sort()
        cur_title = ''
        # Pass 2: concatenate spans whose size is within 0.3pt of the two
        # largest sizes (titles often span several lines/blocks).
        for page_index, page in enumerate(doc):
            text = page.get_text("dict")
            blocks = text["blocks"]
            for block in blocks:
                if block["type"] == 0 and len(block['lines']):
                    if len(block["lines"][0]["spans"]):
                        cur_string = block["lines"][0]["spans"][0]["text"]
                        font_flags = block["lines"][0]["spans"][0]["flags"]
                        font_size = block["lines"][0]["spans"][0]["size"]
                        if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                            # Skip short fragments and the "arXiv:..." banner.
                            if len(cur_string) > 4 and "arXiv" not in cur_string:
                                if cur_title == '':
                                    cur_title += cur_string
                                else:
                                    cur_title += ' ' + cur_string
                                self.title_page = page_index
        title = cur_title.replace('\n', ' ')
        return title

    def extract_section_infomation(self):
        """Split the document into sections keyed by detected headings.

        Results go to ``self.section_names`` / ``self.section_texts``; content
        after a "References" heading is discarded.  (Method name keeps the
        original spelling for caller compatibility.)
        """
        doc = fitz.open(self.path)

        # Collect every span's font size to determine the body-text size.
        font_sizes = []
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if 'lines' not in block:
                    continue
                lines = block["lines"]
                for line in lines:
                    for span in line["spans"]:
                        font_sizes.append(span["size"])
        most_common_size, _ = Counter(font_sizes).most_common(1)[0]

        # Anything strictly larger than the most common (body) size may be a heading.
        threshold = most_common_size * 1

        section_dict = {}
        last_heading = None
        subheadings = []
        heading_font = -1
        # Walk the pages looking for headings; text before "Abstract" is skipped.
        found_abstract = False
        upper_heading = False  # set when the paper uses ALL-CAPS headings
        font_heading = False   # set when the paper uses larger-font headings
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if not found_abstract:
                    try:
                        text = json.dumps(block)
                    except Exception:  # was a bare except; block may not be JSON-serializable
                        continue
                    if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                        found_abstract = True
                        last_heading = "Abstract"
                        section_dict["Abstract"] = ""
                if found_abstract:
                    if 'lines' not in block:
                        continue
                    lines = block["lines"]
                    for line in lines:
                        for span in line["spans"]:
                            # Case 1: ALL-CAPS heading (>4 capital letters), same size as body text.
                            if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <= 'Z')) > 4:
                                upper_heading = True
                                heading = span["text"].strip()
                                if "References" in heading:  # ignore everything after References
                                    self.section_names = subheadings
                                    self.section_texts = section_dict
                                    return
                                subheadings.append(heading)
                                if last_heading is not None:
                                    section_dict[last_heading] = section_dict[last_heading].strip()
                                section_dict[heading] = ""
                                last_heading = heading
                            # Case 2: heading detected by larger-than-body font size.
                            if not upper_heading and span["size"] > threshold and re.match(
                                    r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
                                    span["text"].strip()):
                                font_heading = True
                                if heading_font == -1:
                                    heading_font = span["size"]  # lock onto the first heading's size
                                elif heading_font != span["size"]:
                                    continue
                                heading = span["text"].strip()
                                if "References" in heading:  # ignore everything after References
                                    self.section_names = subheadings
                                    self.section_texts = section_dict
                                    return
                                subheadings.append(heading)
                                if last_heading is not None:
                                    section_dict[last_heading] = section_dict[last_heading].strip()
                                section_dict[heading] = ""
                                last_heading = heading
                            # Otherwise: body text — append to the current section.
                            elif last_heading is not None:
                                section_dict[last_heading] += " " + span["text"].strip()
        self.section_names = subheadings
        self.section_texts = section_dict
181
+
182
+
183
def main():
    """Smoke-test entry point: parse the bundled demo PDF."""
    demo_path = r'demo.pdf'
    demo_paper = Paper(path=demo_path)
    demo_paper.parse_pdf()


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.21.1
2
+ tiktoken==0.2.0
3
+ tenacity==8.2.2
4
+ pybase64==1.2.3
5
+ Pillow==9.4.0
6
+ openai==0.27.0
7
+ markdown
8
+ gradio==3.20.1