Head's Up! These forums are read-only. All users and content have migrated. Please join us at community.neo4j.com.
02-27-2022 10:05 AM
Hello, I have trouble adding properties to nodes. I'm currently working with this tutorial: Create a graph database in Neo4j using Python | by CJ Sullivan | Towards Data Science but as I am completely new to Neo4j, I do not know how to add properties to nodes. I'm working on the arXiv dataset (arXiv Dataset | Kaggle) and I wanted to add the creation date of papers as a property to the paper nodes in the graph. Here is my notebook (I'm using Neo4j Blank Sandboxes for node display):
pip install neo4j
pip install pandas
import neo4j
from neo4j import GraphDatabase
import pandas as pd
import json
import time
from datetime import datetime
data = 'arXiv-data/test.json'
metadata = []
lines = 149
with open(data, 'r') as f:
for line in f:
metadata.append(json.loads(line))
lines -= 1
if lines == 0: break
df = pd.DataFrame(metadata)
df.dtypes
def get_author_list(line):
# Cleans author dataframe column, creating a list of authors in the row.
return [e[1] + ' ' + e[0] for e in line]
df['cleaned_authors_list'] = df['authors_parsed'].map(get_author_list)
df['created_date'] = [datetime.strptime(date[0]['created'].split(',')[1],' %d %b %Y %H:%M:%S %Z')
for date in df['versions']]
df.drop(['submitter', 'authors', 'title', 'journal-ref', 'doi', 'report-no', 'comments', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'], axis = 1)
class Neo4jConnection:
def __init__(self, uri, user, pwd):
self.__uri = uri
self.__user = user
self.__pwd = pwd
self.__driver = None
try:
self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
except Exception as e:
print("Failed to create the driver:", e)
def close(self):
if self.__driver is not None:
self.__driver.close()
def query(self, query, parameters=None, db=None):
assert self.__driver is not None, "Driver not initialized!"
session = None
response = None
try:
session = self.__driver.session(database=db) if db is not None else self.__driver.session()
response = list(session.run(query, parameters))
except Exception as e:
print("Query failed:", e)
finally:
if session is not None:
session.close()
return response
conn = Neo4jConnection(uri="bolt://44.197.113.107:7687", user="neo4j",pwd="delight-hardships-mitt")
conn.query('CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper) ASSERT p.id IS UNIQUE')
conn.query('CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE')
def add_authors(rows, batch_size=10000):
# Adds author nodes to the Neo4j graph as a batch job.
query = '''
UNWIND $rows AS row
MERGE (:Author {name: row.author})
RETURN count(*) as total
'''
return insert_data(query, rows, batch_size)
def insert_data(query, rows, batch_size = 10000):
# Function to handle the updating the Neo4j database in batch mode.
total = 0
batch = 0
start = time.time()
result = None
while batch * batch_size < len(rows):
res = conn.query(query,
parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
total += res[0]['total']
batch += 1
result = {"total":total,
"batches":batch,
"time":time.time()-start}
print(result)
return result
def add_papers(rows, batch_size=50):
# Adds paper nodes and (:Author)--(:Paper) relationships to the Neo4j graph as a
# batch job.
query = '''
UNWIND $rows as row
MERGE (p:Paper {id:row.id}) ON CREATE SET p.title = row.title, p.date = row.created_date
// connect authors
WITH distinct row, p // reduce cardinality
UNWIND row.cleaned_authors_list AS author
MATCH (a:Author {name: author})
MERGE (a)-[:AUTHORED]->(p)
RETURN count(distinct p) as total
'''
return insert_data(query, rows, batch_size)
authors = pd.DataFrame(df[['cleaned_authors_list']])
authors.rename(columns={'cleaned_authors_list':'author'},
inplace=True)
authors=authors.explode('author').drop_duplicates(subset=['author'])
add_authors(authors)
add_papers(df)
After trying to run the notebook, I get the following error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [10], in <module>
4 authors=authors.explode('author').drop_duplicates(subset=['author'])
6 add_authors(authors)
----> 7 add_papers(df)
Input In [9], in add_papers(rows, batch_size)
1 def add_papers(rows, batch_size=50):
2 # Adds paper nodes and (:Author)--(:Paper) relationships to the Neo4j graph as a
3 # batch job.
5 query = '''
6 UNWIND $rows as row
7 MERGE (p:Paper {id:row.id}) ON CREATE
(...)
16 RETURN count(distinct p) as total
17 '''
---> 19 return insert_data(query, rows, batch_size)
Input In [8], in insert_data(query, rows, batch_size)
19 while batch * batch_size < len(rows):
21 res = conn.query(query,
22 parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
---> 23 total += res[0]['total']
24 batch += 1
25 result = {"total":total,
26 "batches":batch,
27 "time":time.time()-start}
TypeError: 'NoneType' object is not subscriptable
Any help would be highly appreciated!
03-04-2022 06:24 AM
You can leverage a python ingest utility like this ( neo4j-field/pyingest (github.com)).
This will reduce the need to write custom code to ingest the data.
All the sessions of the conference are now available online