Skip to content

Commit 8354909

Browse files
committed
added QA version
1 parent 43f04af commit 8354909

File tree

7 files changed

+476
-117
lines changed

7 files changed

+476
-117
lines changed

gpt/utils.py

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,114 @@ def gpt_short_verbalizer(files_dir, llm_api, llm_strong, llm_base, logging):
168168
return gpttext, slides
169169

170170

171+
def gpt_qa_verbalizer(files_dir, llm_api, llm_base, matcher, logging):
    """Generate exam-style Q&A pairs for a paper and map each answer to pages.

    Reads the page-split paper text from *files_dir*, asks the LLM (via an
    OpenAI function call) to produce a list of probing questions, then answers
    each question with a second round of LLM calls. Each answer is stripped of
    markup and matched sentence-by-sentence back to paper pages via *matcher*.

    Args:
        files_dir: directory containing ``original_text_split_pages.txt``.
        llm_api: OpenAI-style chat-completion callable
            (``llm_api(model=..., messages=..., temperature=..., ...)``).
        llm_base: model name forwarded to ``llm_api``.
        matcher: object with a ``match(sentences, pages, ...)`` method
            returning a per-sentence sequence of best-matching page indices.
        logging: logger; kept for signature compatibility (unused here).

    Returns:
        Tuple ``(questions, answers, pages)`` of parallel lists: for each
        question, the cleaned answer text and the per-sentence page indices
        produced by ``matcher.match``.

    Raises:
        Exception: if an LLM call still fails after three attempts.
    """

    def _call_llm(messages, **kwargs):
        # Retry transient API failures up to three times, sleeping 5s between
        # attempts.  Narrowed from a bare `except:` so Ctrl-C still works.
        for _ in range(3):
            try:
                return llm_api(model=llm_base, messages=messages,
                               temperature=0, **kwargs)
            except Exception:
                time.sleep(5)
        raise Exception(f"{llm_base} failed")

    with open(os.path.join(files_dir, "original_text_split_pages.txt"),
              encoding="utf-8") as f:
        paper_text = f.read()

    system_message = 'You are a college professor, known for your expert knowledge in deep learning field. ' \
                     'You are also known for creating very thoughtful and probing questions that examine' \
                     'the actual knowledge of a student based on their submitted paper. Your goal is to come up with ' \
                     'a list of questions, both on intuitive level and on deeper technical level that evaluate if ' \
                     'a student really knows about his or her work. Focus on the knowledge of the main proposed method, ' \
                     'motivation and results. Make sure your list of questions examine the student thoroughly. ' \
                     'Ask at least 10 different and diverse questions. ' \
                     'The questions must cover intuition, main idea and technical details, among others. ' \
                     'Be extremely specific and ask about details presented in the paper, no generic or abstract questions. '

    human_message = f'Below is the student arxiv paper about which the questions needs to be asked: {paper_text}'

    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': human_message}
    ]

    # Round 1: request the question list through a function call so the
    # result comes back as structured JSON rather than free text.
    response = _call_llm(
        messages,
        functions=[
            {
                "name": "ask_questions",
                "description": "ask questions about provided arxiv paper",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "questions": {
                            "type": "array",
                            "description": "the list of questions to be asked",
                            "items": {
                                "type": "string",
                                "description": "individual question, thoughtful and revealing",
                            }
                        },
                    },
                    "required": ["questions"],
                },
            }
        ],
        function_call="auto",
    )

    Qs = json.loads(response.choices[0].message.function_call.arguments)

    system_message = 'You are a student, who wrote this paper. You are on a very important exam. ' \
                     'You are tasked to explain your work as best as you can. ' \
                     'You will be provided with a text of the paper, split by pages and a question. ' \
                     'You must answer the question using information given in the paper. ' \
                     'The answer should be consice and to the point but still contain details. ' \
                     'And it should answer the question as best as possible. Be extremly specific. ' \
                     'Ground your response to the provided paper text. Do NOT use generic or abstract phrases. ' \
                     'Your career depends on how well you do this job. I will tip you $2000 for an excellent job done. ' \
                     'Make sure to answer using at least 10 (ten) sentences.'

    answers = []
    pages = []

    # Page texts as delimited by the 'PAGE ' marker; the chunk before the
    # first marker is dropped.  Hoisted out of the loop (loop-invariant).
    page_texts = paper_text.split('PAGE ')[1:]

    # Round 2: answer every question independently against the full paper.
    for Q in Qs['questions']:
        human_message = f'Here is the text of the split by pages: {paper_text}. And here is the question you need to answer: {Q}. ' \
                        'Make sure your answer best reflects the provided text.'

        messages = [{'role': 'system', 'content': system_message},
                    {'role': 'user', 'content': human_message}]

        response = _call_llm(messages)

        answer = response.choices[0].message.content

        # Strip markup characters that would confuse downstream TTS/matching.
        answer = answer.replace("$", '').replace("```", '').replace("<<", '').replace(">>", '').replace("**", '')

        # remove words containing double underscores (markup leftovers)
        answer = re.sub(r'\b\w*__\w*\b', '', answer)

        answers.append(answer)

        # Match each answer sentence to its most likely source page.  Split
        # on '.'; the trailing fragment after the final period is dropped.
        sentences = answer.split('.')
        seq = matcher.match(sentences[:-1], page_texts, minilm=1, bert=1,
                            fuzz=1, spacy=1, diff=1, tfidf=1, pnt=True)
        # per-sentence page indices (majority vote left to the caller)
        pages.append(seq)

    return Qs['questions'], answers, pages
277+
278+
171279
def gpt_textvideo_verbalizer(text, llm_api, llm_strong, llm_base, manual, include_summary, pageblockmap, matcher, logging):
172280

173281
encoding = tiktoken.get_encoding("cl100k_base")
@@ -200,7 +308,7 @@ def gpt_textvideo_verbalizer(text, llm_api, llm_strong, llm_base, manual, includ
200308

201309
for i_s, sec in enumerate(sections):
202310

203-
# if there is a mismatch, make them of equal length
311+
# if there is a mismatch, do this hack to make them of equal length
204312
if len(sent_tokenize(sec)) != len(pagemap_sections[i_s]):
205313
minN = min(len(sent_tokenize(sec)), len(pagemap_sections[i_s]))
206314
cleaned_sent_tok = sent_tokenize(sec)[:minN]
@@ -350,8 +458,8 @@ def gpt_textvideo_verbalizer(text, llm_api, llm_strong, llm_base, manual, includ
350458
page_inds = []
351459
curr_upd = []
352460

353-
if len(sent_tokenize(gpttext_all)) != len(gptpagemap):
354-
raise Exception("Something went wrong. Mismatch between map and text")
461+
if len(sent_tokenize(gpttext_all)) != len(gptpagemap):
462+
raise Exception("Something went wrong. Mismatch between map and text")
355463

356464
return gpttext_all, gptpagemap, verbalizer_steps, textpagemap
357465

main.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,25 @@ def main(args):
137137

138138
tmpdata = {'gpttext_short': gpttext_short, 'gptslides_short': slides_short['slides']}
139139

140+
if args.create_qa:
141+
questions, answers, qa_pages = gpt_qa_verbalizer(files_dir, llm_api, args.llm_base, matcher, logging)
142+
143+
create_questions(questions, os.path.join(files_dir, 'questions'))
144+
145+
with open(os.path.join(files_dir, 'qa_pages.pkl'), 'wb') as f:
146+
pickle.dump(qa_pages, f)
147+
148+
with open(os.path.join(files_dir, 'gpt_questions_answers.txt'), 'w') as f:
149+
for q, a in zip(questions, answers):
150+
f.write(f'==== Question ====\n\n')
151+
f.write(q)
152+
f.write("\n\n")
153+
f.write(f'==== Answer ====\n\n')
154+
f.write(a)
155+
f.write("\n\n")
156+
157+
tmpdata = {'gpttext_q': questions, 'gpttext_a': answers, 'qa_pages': qa_pages}
158+
140159
if args.create_video:
141160
(gpttext, gptpagemap,
142161
verbalizer_steps, textpagemap) = gpt_textvideo_verbalizer(text,
@@ -205,6 +224,23 @@ def main(args):
205224

206225
create_slides(slides_short, os.path.join(files_dir, 'slides'))
207226

227+
if args.create_qa:
228+
with open(os.path.join(files_dir, args.chunk_mp3_file_list), 'w') as mp3_list_file:
229+
text_to_speech_qa(questions, answers, mp3_list_file, files_dir, tts_client, args.ffmpeg, logging)
230+
231+
shutil.copy(os.path.join(files_dir, args.chunk_mp3_file_list),
232+
os.path.join(files_dir, f'qa_{args.chunk_mp3_file_list}'))
233+
234+
final_audio_qa = os.path.join(files_dir, f'{args.final_audio_file}_qa.mp3')
235+
os.system(f'{args.ffmpeg} -f concat -i {os.path.join(files_dir, args.chunk_mp3_file_list)} '
236+
f'-c copy {final_audio_qa} {display}')
237+
238+
logging.info(f'Created QA audio file')
239+
240+
if args.gdrive_id:
241+
gdrive_client.upload_audio(f'[QA] {title}', f'{final_audio_qa}')
242+
logging.info(f'Uploaded QA audio to GDrive')
243+
208244
if args.create_video:
209245
with open(os.path.join(files_dir, args.chunk_mp3_file_list), 'w') as mp3_list_file:
210246
text_to_speechvideo(gpttext, mp3_list_file, files_dir, tts_client, gptpagemap, args.voice, logging)
@@ -264,10 +300,11 @@ def main(args):
264300
parser.add_argument("--extract_text_only", action="store_true", help="extract only the text from paper and exit")
265301
parser.add_argument("--create_video", action="store_true", help="create long video")
266302
parser.add_argument("--create_short", action="store_true", help="create short video")
303+
parser.add_argument("--create_qa", action="store_true", help="create qa video")
267304
parser.add_argument("--create_audio_simple", action="store_true", help="create audio")
268305
parser.add_argument("--openai_key", type=str, default="", help='openai key to call GPT API')
269306
parser.add_argument("--llm_strong", type=str, default="gpt-4-0125-preview", help='llm model for complex tasks')
270-
parser.add_argument("--llm_base", type=str, default="gpt-3.5-turbo-1106", help='llm model for basic tasks')
307+
parser.add_argument("--llm_base", type=str, default="gpt-3.5-turbo-0125", help='llm model for basic tasks')
271308

272309
args = parser.parse_args()
273310

makevideo.py

Lines changed: 94 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def main(args):
3737

3838
# Process each line
3939
for line in lines:
40+
4041
# Remove the newline character at the end of the line
4142
line = line.strip()
4243

@@ -151,51 +152,111 @@ def main(args):
151152

152153
# =============== SHORT VIDEO ====================
153154

154-
with open(os.path.join(dr, "shorts_mp3_list.txt"), "r") as f:
155-
lines = f.readlines()
155+
if os.path.exists(os.path.join(dr, "shorts_mp3_list.txt")):
156156

157-
# create list of chunks
158-
outvideo = open(os.path.join(dr, 'short_mp4_list.txt'), 'w')
157+
with open(os.path.join(dr, "shorts_mp3_list.txt"), "r") as f:
158+
lines = f.readlines()
159159

160-
# Process each line
161-
for page_num, line in enumerate(lines):
162-
# Remove the newline character at the end of the line
163-
line = line.strip()
160+
# create list of chunks
161+
outvideo = open(os.path.join(dr, 'short_mp4_list.txt'), 'w')
164162

165-
# Split the line into components
166-
components = line.split()
163+
# Process each line
164+
for page_num, line in enumerate(lines):
165+
# Remove the newline character at the end of the line
166+
line = line.strip()
167167

168-
# The filename is the second component
169-
audio = components[1].replace('.mp3', '')
170-
video = audio.replace('-', '')
168+
# Split the line into components
169+
components = line.split()
171170

172-
# convert to PNG
173-
if page_num == 0:
174-
input_path = os.path.join(dr, str(page_num))
175-
else:
176-
input_path = os.path.join(dr, 'slides', f'slide_{page_num}')
171+
# The filename is the second component
172+
audio = components[1].replace('.mp3', '')
173+
video = audio.replace('-', '')
177174

178-
os.system(f'{args.gs} -sDEVICE=png16m -r500 -o {os.path.join(dr, str(page_num))}.png {input_path}.pdf')
175+
# convert to PNG
176+
if page_num == 0:
177+
input_path = os.path.join(dr, str(page_num))
178+
else:
179+
input_path = os.path.join(dr, 'slides', f'slide_{page_num}')
179180

180-
resolution = "scale=1920:-2"
181-
os.system(f'{args.ffmpeg} -loop 1 -i {os.path.join(dr, str(page_num))}.png -i {os.path.join(dr, audio)}.mp3 '
182-
f'-vf {resolution} -c:v libx264 -tune stillimage -y -c:a aac -b:a 128k -pix_fmt yuv420p '
183-
f'-shortest {os.path.join(dr, video)}.mp4')
181+
os.system(f'{args.gs} -sDEVICE=png16m -r500 -o {os.path.join(dr, str(page_num))}.png {input_path}.pdf')
184182

185-
# ensure that there is no silence at the end of the video, and video len is the same as audio len
186-
os.system(f'audio_duration=$({args.ffprobe} -i {os.path.join(dr, audio)}.mp3 -show_entries format=duration '
187-
f'-v quiet -of csv="p=0"); 'f'audio_duration=$((${{audio_duration%.*}} + 1)); 'f'{args.ffmpeg} '
188-
f'-i {os.path.join(dr, video)}.mp4 -t $audio_duration -y -c copy {os.path.join(dr, video)}_final.mp4')
183+
resolution = "scale=1920:-2"
184+
os.system(f'{args.ffmpeg} -loop 1 -i {os.path.join(dr, str(page_num))}.png -i {os.path.join(dr, audio)}.mp3 '
185+
f'-vf {resolution} -c:v libx264 -tune stillimage -y -c:a aac -b:a 128k -pix_fmt yuv420p '
186+
f'-shortest {os.path.join(dr, video)}.mp4')
189187

190-
# list of all chunks
191-
outvideo.write(f"file '{video}_final.mp4'\n")
188+
# ensure that there is no silence at the end of the video, and video len is the same as audio len
189+
os.system(f'audio_duration=$({args.ffprobe} -i {os.path.join(dr, audio)}.mp3 -show_entries format=duration '
190+
f'-v quiet -of csv="p=0"); 'f'audio_duration=$((${{audio_duration%.*}} + 1)); 'f'{args.ffmpeg} '
191+
f'-i {os.path.join(dr, video)}.mp4 -t $audio_duration -y -c copy {os.path.join(dr, video)}_final.mp4')
192192

193-
outvideo.close()
193+
# list of all chunks
194+
outvideo.write(f"file '{video}_final.mp4'\n")
194195

195-
# joint video
196-
os.system(f'{args.ffmpeg} -f concat -i {os.path.join(dr, "short_mp4_list.txt")} '
197-
f'-y -c copy {os.path.join(dr, "output_short.mp4")}')
196+
outvideo.close()
197+
198+
# joint video
199+
os.system(f'{args.ffmpeg} -f concat -i {os.path.join(dr, "short_mp4_list.txt")} '
200+
f'-y -c copy {os.path.join(dr, "output_short.mp4")}')
201+
202+
# =============== QA VIDEO ====================
203+
204+
if os.path.exists(os.path.join(dr, "qa_mp3_list.txt")):
205+
206+
with open(os.path.join(dr, "qa_mp3_list.txt"), "r") as f:
207+
lines = f.readlines()
208+
209+
# create list of chunks
210+
outvideo = open(os.path.join(dr, 'qa_mp4_list.txt'), 'w')
211+
212+
qa_pages = pickle.load(open(os.path.join(dr, 'qa_pages.pkl'), 'rb'))
213+
214+
# Process each line
215+
turn = -1
216+
for line_num, line in enumerate(lines):
217+
# Remove the newline character at the end of the line
218+
line = line.strip()
219+
220+
# Split the line into components
221+
components = line.split()
222+
223+
# The filename is the second component
224+
audio = components[1].replace('.mp3', '')
225+
video = audio.replace('-', '')
226+
227+
# convert to PNG
228+
if 'question' in audio: # question - get created slide
229+
turn += 1
230+
page_num = 0
231+
input_path = os.path.join(dr, 'questions', f'question_{turn}')
232+
else: # answer - get single page from paper
233+
p_num = qa_pages[turn][page_num]
234+
# extract the page from PDF
235+
os.system(f'{args.gs} -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -dFirstPage={p_num+1} -dLastPage={p_num+1} -sOutputFile={os.path.join(dr, str(p_num))}.pdf {os.path.join(dr, "main.pdf")} > /dev/null 2>&1')
236+
input_path = os.path.join(dr, f'{p_num}')
237+
page_num += 1
238+
239+
qa_page = 'qa_page.png'
240+
os.system(f'{args.gs} -sDEVICE=png16m -r500 -o {os.path.join(dr, qa_page)} {input_path}.pdf')
241+
242+
resolution = "scale=1920:-2"
243+
os.system(f'{args.ffmpeg} -loop 1 -i {os.path.join(dr, qa_page)} -i {os.path.join(dr, audio)}.mp3 '
244+
f'-vf {resolution} -c:v libx264 -tune stillimage -y -c:a aac -b:a 128k -pix_fmt yuv420p '
245+
f'-shortest {os.path.join(dr, video)}.mp4')
246+
247+
# ensure that there is no silence at the end of the video, and video len is the same as audio len
248+
os.system(f'audio_duration=$({args.ffprobe} -i {os.path.join(dr, audio)}.mp3 -show_entries format=duration '
249+
f'-v quiet -of csv="p=0"); 'f'audio_duration=$((${{audio_duration%.*}} + 1)); 'f'{args.ffmpeg} '
250+
f'-i {os.path.join(dr, video)}.mp4 -t $audio_duration -y -c copy {os.path.join(dr, video)}_final.mp4')
251+
252+
# list of all chunks
253+
outvideo.write(f"file '{video}_final.mp4'\n")
254+
255+
outvideo.close()
198256

257+
# joint video
258+
os.system(f'{args.ffmpeg} -f concat -i {os.path.join(dr, "qa_mp4_list.txt")} '
259+
f'-y -c copy {os.path.join(dr, "output_qa.mp4")}')
199260

200261
if __name__ == "__main__":
201262
parser = argparse.ArgumentParser(description='Arguments')

0 commit comments

Comments
 (0)