22import requests
33
44from dotenv import load_dotenv
5- from bs4 import BeautifulSoup
65from langchain .embeddings import OllamaEmbeddings , OpenAIEmbeddings
76from langchain .graphs import Neo4jGraph
87
@@ -43,6 +42,7 @@ def create_constraints():
4342
# Ensure uniqueness constraints/indexes exist before any import runs,
# so the MERGE statements in load_so_data() are fast and cannot create duplicates.
create_constraints()
4544
45+
4646def create_vector_index (dimension ):
4747 # TODO use Neo4jVector Code from LangChain on the existing graph
4848 index_query = "CALL db.index.vector.createNodeIndex('stackoverflow', 'Question', 'embedding', $dimension, 'cosine')"
@@ -56,27 +56,22 @@ def create_vector_index(dimension):
5656
5757
5858def load_so_data (tag : str = "neo4j" , page : int = 1 ) -> None :
59- base_url = "https://api.stackexchange.com/2.2/questions "
59+ base_url = "https://api.stackexchange.com/2.3/search/advanced "
6060 parameters = (
61- f"?pagesize=100&page={ page } &order=desc&sort=creation&tagged={ tag } "
62- "&site=stackoverflow&filter=!6WPIomnMNcVD9 "
61+ f"?pagesize=100&page={ page } &order=desc&sort=creation&answers=1& tagged={ tag } "
62+ "&site=stackoverflow&filter=!51dU0b1n(WTdqj5MH1iGsNShY6BhXXwJ)xwV5b "
6363 )
6464 data = requests .get (base_url + parameters ).json ()
6565 # Convert html to text and calculate embedding values
6666 for q in data ["items" ]:
67- question_text = BeautifulSoup (q ["body" ], features = "html.parser" ).text
68- q ["body" ] = question_text
69- q ["embedding" ] = embeddings .embed_query (q ["title" ] + " " + question_text )
70- if q .get ("answers" ):
71- for a in q .get ("answers" ):
72- a ["body" ] = BeautifulSoup (a ["body" ], features = "html.parser" ).text
67+ q ["embedding" ] = embeddings .embed_query (q ["title" ] + " " + q ["body_markdown" ])
7368
7469 import_query = """
7570 UNWIND $data AS q
7671 MERGE (question:Question {id:q.question_id})
7772 ON CREATE SET question.title = q.title, question.link = q.link,
78- question.favorite_count = q.favorite_count, question.creation_date = q.creation_date,
79- question.body = q.body , question.embedding = q.embedding
73+ question.favorite_count = q.favorite_count, question.creation_date = datetime({epochSeconds: q.creation_date}) ,
74+ question.body = q.body_markdown , question.embedding = q.embedding
8075 FOREACH (tagName IN q.tags |
8176 MERGE (tag:Tag {name:tagName})
8277 MERGE (question)-[:TAGGED]->(tag)
@@ -85,8 +80,8 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
8580 MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id})
8681 SET answer.is_accepted = a.is_accepted,
8782 answer.score = a.score,
88- answer.creation_date = a.creation_date,
89- answer.body = a.body
83+ answer.creation_date = datetime({epochSeconds: a.creation_date}) ,
84+ answer.body = a.body_markdown
9085 MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")})
9186 ON CREATE SET answerer.display_name = a.owner.display_name,
9287 answerer.reputation= a.owner.reputation
@@ -103,19 +98,24 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
10398
10499# Streamlit
def get_tag() -> str:
    """Render a text input asking which StackOverflow tag to import.

    Returns the tag the user typed; defaults to "neo4j".
    """
    return st.text_input("Which tag questions do you want to import?", value="neo4j")
108105
109106
def get_pages():
    """Render the paging controls and return (num_pages, start_page) as ints.

    Side effects: draws two number inputs in side-by-side columns plus a
    caption explaining that only answered questions are imported.
    """
    left_col, right_col = st.columns(2)
    with left_col:
        pages = st.number_input(
            "Number of pages (100 questions per page)", step=1, min_value=1
        )
    with right_col:
        first_page = st.number_input("Start page", step=1, min_value=1)
    st.caption("Only questions with answers will be imported.")
    return int(pages), int(first_page)
118117
118+
# --- Streamlit page chrome: title, subtitle, and a pointer to the Neo4j browser ---
st.header("StackOverflow Loader")
st.subheader("Choose StackOverflow tags to load into Neo4j")
st.caption("Go to http://localhost:7474/browser/ to explore the graph.")
@@ -127,7 +127,7 @@ def get_pages():
127127 with st .spinner ("Loading... This might take a minute or two." ):
128128 try :
129129 for page in range (1 , num_pages + 1 ):
130- load_so_data (user_input , start_page + (page - 1 ))
130+ load_so_data (user_input , start_page + (page - 1 ))
131131 st .success ("Import successful" , icon = "✅" )
132132 except Exception as e :
133133 st .error (f"Error: { e } " , icon = "🚨" )