homme.io
Clean.Precise.Quick.
..
PAX ROMANA
SAKURA
Фотография
Философия
Искусство
История
C/C++
DBMS
Oracle
Спорт
Linux
Lua
IT

Infinitum.Aeterna
At Fingertips
Passingly.Photo
2024.01.Китай
Иран в лицах
2023.10.Iran
2023.06.Moscow
2023.04.Istanbul
2023.01.Moscow-XI
2022.11.Moscow-X
2022.11.Турция
2022.10.Царицыно
2022.09.Moscow-IX
2022.08.Зубовка
2022.07.Турция
2022.07.Moscow-VIII
2022.07.ЮЗ-VII
2022.07.Парк_50-IV
2022.07.ЮЗ-VI
2022.06.Moscow-VII
2022.06.Moscow-VI
2022.06.Сокольники
2022.06.Раменское
2022.06.Воронцово
2022.05.Парк_50-III
2022.05.Парк_50-II
2022.05.Архангельское
2022.05.Парк_50
2022.05.Moscow-V
2022.05.Зоопарк
2022.05.Санкт-Петербург
2022.05.Ярославль
2022.04.Moscow-IV
2022.04.Moscow-III
2022.04.Moscow-II
2022.04.Moscow-I
2022.04.ЮЗ-V
2022.03.Зубовка
2022.03.Кокошкино
2022.03.Сочи
2022.03.Воронцово-Центр
2022.02.Царицыно
2022.02.Стамбул
2022.02.Коломенское
2022.02.ЮЗ-Центр ночью
2022.02.ЮЗ-IV
2022.02.ЮЗ-III
2022.02.ЮЗ-II
2022.02.ЮЗ-I
2022.02.ЮЗ ночью
2022.02.Ночной центр
2022.01.Центр
2022.01.Перед экскурсией
2022.01.Боулинг в Высотке
2022.01.Cat
2022.01.Павелецкая-Центр
2021.10.Zubovka
2021.Golden Autumn
2021.Egypt
2021.08.Раменское
2021.07.Лужники-Сити
2021.07.Около Цирка
2021.06.Зарядье
2021.06.В парке
2021.06.Зоопарк
Night.Photo
2021.05.МГУ
2021.Дивеево
2021.Азов
2021.02.Зоопарк
2021.Карелия
2020.Petersburg
2020.Turkey
2020.Anosino
2020.Azov
2020.Vereya
2020.Арктика
2020.Greece
2019.Turkey
2019.Zubovka
2019.Dagestan
2019.Dagestan+
2019.Egypt
2019.Italy
2019.Kulikovo Field
2019.Kaluga
2019.02.Dancing
2019.Baikal
2018.Переславль
2018.Плес
2018.Березка
2018.Крым
2018.Азов
2018.Калининград
2018.Petersburg
2018.Elbrus
2017.Turkey
2015.Egypt
2013.Egypt
2013.Rome

Part II: Neo4j vs Oracle Performance Test

Introduction

Let's random generate 2 million unique Customers [:VISITED] 40000 unique Merchants. Every [:VISIT] has properties: amount (double) and dt (date). Every Customer has property “pty_id” (Integer). And every Merchant has mcht_id (String) property.

One Customer may visit one Merchant for more than one time. And of course, one Customer may visit many Merchants. So there are 43 978 539 relationships in my graph between Customers and Merchants.

The task is:

Get top 10 Customers ordered by total_amount who spent their money at NOT specified Merchant(M) but visit that Merchants which have been visited by Customers who visit this specified Merchant(M)

Hardware

Intel i7 2.3 GHz, SSD; VM (macOS Mojave + Parallels 12): 4 cores, 8 GB RAM

ORACLE Part

Oracle 19C Ent. Edition

SHOW SGA:

Total System Global Area   6777994992 bytes      
Fixed Size                           9150192 bytes      
Variable Size                       4630511616 bytes      
Database Buffers                2130706432 bytes      
Redo Buffers                       7626752 bytes    

select value/1024/1024/1024 from V$PGASTAT where name = 'aggregate PGA target parameter';

3.96875

ORA TEMPORARY SEGMENT = 40 GB

 

as scott:

CREATE TABLE OP_DET(
MCHT_ID VARCHAR2(100)
,PTY_ID NUMBER
,AMOUNT NUMBER
,DT DATE
);

--Generate 2M Customers, 40K Merchants and   ~40+M relationships.

DECLARE
    K NUMBER;
    L NUMBER;
    MCHT VARCHAR2(100);
    TYPE T_TAB IS TABLE OF OP_DET%ROWTYPE;
    TAB T_TAB := T_TAB();
    LIMIT CONSTANT NUMBER := 10000;
    T NUMBER;

BEGIN
/* IF 600 (THIS IS LIMIT VALUE FOR  XE WITH RESTRICTION ON DB SIZE = 12 GB) THEN 
SELECT COUNT(*) FROM OP_DET; ==132 000 671; LS -LAH /ORADATA/  ==5.9G */

FOR Q IN 1..200 LOOP
    TAB.DELETE();
    FOR I IN 1..LIMIT LOOP
        K := ROUND(DBMS_RANDOM.VALUE(1,7));
        FOR J IN 1..K LOOP
            L := ROUND(DBMS_RANDOM.VALUE(1,40000));
            MCHT := LPAD(TRIM(UPPER(TO_CHAR(L,'XXXXXXXXXXX'))),10,'0');
            T := ROUND(DBMS_RANDOM.VALUE(1,10));
            FOR Z IN 1..T LOOP
                TAB.EXTEND();
                TAB(TAB.COUNT).MCHT_ID := MCHT;
                TAB(TAB.COUNT).PTY_ID := I+(Q-1)*LIMIT;
                TAB(TAB.COUNT).AMOUNT := ROUND(DBMS_RANDOM.VALUE(1,10000),2);
                TAB(TAB.COUNT).DT := SYSDATE + ROUND(DBMS_RANDOM.VALUE(1,5)) -
                                               ROUND(DBMS_RANDOM.VALUE(1,5));
            END LOOP;
        END LOOP;
    END LOOP;

    FORALL I IN TAB.FIRST .. TAB.LAST
        INSERT INTO OP_DET VALUES TAB(I);
        COMMIT;
    END LOOP;
END;

PL/SQL procedure successfully completed.

Elapsed: 00:03:34.178
-rw-r-----.  1 oracle oinstall 1.9G Jul  3 11:00 wrkdata.dat

Create indexes

CREATE INDEX OP_DET$PTY_ID ON OP_DET(PTY_ID);

Index OP_DET$PTY_ID created.
Elapsed: 00:00:38.268
-rw-r-----.  1 oracle oinstall 2.7G Jul  3 11:10 wrkdata.dat
CREATE INDEX OP_DET$MCHT_ID ON OP_DET(MCHT_ID);

([linux]$ top: 5429 oracle    20   0 2174560 181652  64392 R  48.8 (%CPU) (%MEM)10.2
2:48.30 oracle_5429_xe )

Index OP_DET$MCHT_ID created.
Elapsed: 00:01:01.954
ls -lah /oradata/
-rw-r-----.  1 oracle oinstall 3.6G (3829407744 bytes) Jul  3 12:04 wrkdata.dat

Compute statistics

analyze table OP_DET compute statistics;

Export customers, merchants and its relationships into CSV-files

as sys:

CREATE DIRECTORY CSV_DIR AS '/gdbms';
GRANT READ ON DIRECTORY CSV_DIR TO SCOTT;
GRANT WRITE ON DIRECTORY CSV_DIR TO SCOTT;

as scott:

--Export distinct PTY_IDs:

DECLARE
  TYPE T_TAB IS TABLE OF PLS_INTEGER;
  RWS T_TAB;
  OUTPUT UTL_FILE.FILE_TYPE;
  FILENAME VARCHAR2(100);
BEGIN
  SELECT DISTINCT (PTY_ID)
  BULK COLLECT INTO RWS
  FROM OP_DET
  --WHERE ROWNUM <= 100
  ;

  FILENAME := 'CUSTOMER.CSV';
  OUTPUT := UTL_FILE.FOPEN ( 'CSV_DIR', FILENAME, 'W' );
  UTL_FILE.PUT_LINE(OUTPUT, 'PTY_ID');
  FOR I IN 1 .. RWS.COUNT LOOP
    UTL_FILE.PUT_LINE ( OUTPUT, RWS(I));
  END LOOP;

  UTL_FILE.FCLOSE(OUTPUT);
END;

PL/SQL procedure successfully completed.
Elapsed: 00:00:13.940
[root@luora oradata]# wc -l CUSTOMER.CSV
2000001 CUSTOMER.CSV

-- Export distinct MCHT_ID

DECLARE
    TYPE T_TAB IS TABLE OF VARCHAR2(100);
    RWS T_TAB;
    OUTPUT UTL_FILE.FILE_TYPE;
    FILENAME VARCHAR2(100);
BEGIN
    SELECT DISTINCT (MCHT_ID)
    BULK COLLECT INTO RWS
    FROM OP_DET
    --WHERE ROWNUM <= 1000
    ;

    FILENAME := 'MERCHANT.CSV';
    OUTPUT := UTL_FILE.FOPEN ( 'CSV_DIR', FILENAME, 'W' );
    UTL_FILE.PUT_LINE(OUTPUT, 'MCHT_ID');
    FOR I IN 1 .. RWS.COUNT LOOP
        UTL_FILE.PUT_LINE ( OUTPUT, RWS(I));
    END LOOP;
    UTL_FILE.FCLOSE(OUTPUT);
END;

PL/SQL procedure successfully completed.
Elapsed: 00:00:03.761
[root@luora oradata]# wc -l MERCHANT.CSV
40001 MERCHANT.CSV

---export relations of customers to merchants i.e. operations (pl/sql tables are not appropriate for this because we may exceed  PGA_AGGREGATE_LIMIT)

DECLARE
    OUTPUT UTL_FILE.FILE_TYPE;
    FILENAME VARCHAR2(100);
BEGIN
    FILENAME := 'OP_DET.CSV';
    EXECUTE IMMEDIATE 'ALTER SESSION SET NLS_DATE_FORMAT=''YYYY-MM-DD''';
    OUTPUT := UTL_FILE.FOPEN ( 'CSV_DIR', FILENAME, 'W' );
    UTL_FILE.PUT_LINE(OUTPUT, 'PTY_ID,MCHT_ID,AMOUNT,DT');
    FOR I IN (SELECT * FROM OP_DET /* WHERE ROWNUM <=1000000 */) LOOP
        UTL_FILE.PUT_LINE ( OUTPUT, I.PTY_ID || ',' || I.MCHT_ID ||',' || I.AMOUNT ||',' || I.DT );
    END LOOP;
    UTL_FILE.FCLOSE(OUTPUT);
END;

PL/SQL procedure successfully completed.
Elapsed: 00:05:08.229
[root@luora oradata]# wc -l OP_DET.CSV
43978540 OP_DET.CSV
[root@luora oradata]# ls -lah
drwxr-xr-x.  2 oracle oinstall  120 Jul  3 12:25 .
dr-xr-xr-x. 19 root   root     4.0K Jul  1 17:28 ..
-rw-r--r--.  1 oracle oinstall  15M Jul  3 12:15 CUSTOMER.CSV
-rw-r--r--.  1 oracle oinstall 430K Jul  3 12:19 MERCHANT.CSV
-rw-r--r--.  1 oracle oinstall 1.6G Jul  3 12:30 OP_DET.CSV
 

Neo4j Part

[root@graph ~]# nano /etc/neo4j/neo4j.conf

# The name of the database to mount
#dbms.active_database=graph.db
dbms.active_database=perf.db

# Paths of directories in the installation.
dbms.directories.data=/gdbms/neo4j/data
dbms.directories.logs=/gdbms/neo4j

#dbms.memory.heap.max_size=512m
dbms.memory.heap.max_size=3584m

#dbms.memory.pagecache.size=10g
dbms.memory.pagecache.size=4g

Import CSV-files into Neo4j

[root@graph ~]# neo4j-admin import --database=perf.db --nodes:Customer=./CUSTOMER.CSV
--nodes:Merchant=./MERCHANT.CSV --relationships:VISITED=./OP_DET.CSV
[root@graph ~]# chown -R neo4j:neo4j /gdbms/neo4j/data/databases/perf.db
[root@graph ~]# cypher-shell

Transform data types to more appropriate:

cypher-shell> call apoc.periodic.iterate("MATCH (c:Customer) return c",
"SET c.pty_id = toInteger(c.pty_id)",{batchSize:100000,parallel:true}) 
yield batches, total return batches, total
cypher-shell> call apoc.periodic.iterate("match (c:Customer)-[r:VISITED]->(m:Merchant) return r",
"set r.amount=toDouble(r.amount)", {batchSize:100000,parallel:true})
yield batches, total return batches, total

Create indexes

cypher-shell> CREATE INDEX on :Customer(pty_id)

cypher-shell> CREATE INDEX  on :Merchant(mcht_id)


Run Tests

All queries return the same data:       

pty_id amount
1433798 348925.94
739510 339169.83
374933 327962.95
1925125 327545.95
768789 327463.18 
1851163 326690.50 
1290014 323256.00
258667 322899.84 
293300 321374.56
1111539 320392.02

 

But time of execution is different:

Oracle Neo4j
1-st run 00:00:08.673 not optimized 413020 ms (1st run) / 210428 ms (2nd run)
2-nd run 00:00:06.654 opt_1 154885 ms (1st run) / 15371 ms (2nd run)
add /*+ PARALLEL(4) */ 00:00:02.197 opt_2 108747 ms (1st run) / 5326 ms (2nd run)

 

Queries are:

SQL:

SELECT
   id, to_char(amnt, '99999999.99') from
(
SELECT
   cust.id, sum(op.amount) amnt
from
(
SELECT
  distinct op3.pty_id as id
FROM
  OP_DET op1
  ,OP_DET op2
  ,OP_DET op3
WHERE
  OP1.MCHT_ID = '0000000DA5'
  AND OP2.PTY_ID = OP1.PTY_ID
  and op2.mcht_id <> '0000000DA5'
  and op2.mcht_id = op3.mcht_id
  and op3.pty_id not in (SELECT distinct PTY_ID FROM OP_DET WHERE MCHT_ID = '0000000DA5')
) cust, op_det op
where op.pty_id = cust.id
group by cust.id
order by amnt desc
)
where rownum <=10
;

-- +parallel hint

SELECT
   id, to_char(amnt, '99999999.99') from
(
SELECT
   cust.id, sum(op.amount) amnt
from
(
SELECT /*+ PARALLEL(4) */
  distinct op3.pty_id as id
FROM
  OP_DET op1
  ,OP_DET op2
  ,OP_DET op3
WHERE
  OP1.MCHT_ID = '0000000DA5'
  AND OP2.PTY_ID = OP1.PTY_ID
  and op2.mcht_id <> '0000000DA5'
  and op2.mcht_id = op3.mcht_id
  and op3.pty_id not in (SELECT distinct PTY_ID FROM OP_DET WHERE MCHT_ID = '0000000DA5')
) cust, op_det op
where op.pty_id = cust.id
group by cust.id
order by amnt desc
)
where rownum <=10
;

CYPHER:

not opt:

MATCH
  (c:Customer)-[r:VISITED]->(mm:Merchant)<-[:VISITED]-(cc:Customer)-[:VISITED]->
  (m:Merchant {mcht_id: "0000000DA5"})
WHERE
  NOT (c)-[:VISITED]->(m)
WITH
  DISTINCT c as uc
MATCH
  (uc:Customer)-[rr:VISITED]->()
RETURN
  uc.pty_id
  ,round(100*sum(rr.amount))/100 as v_amt
ORDER BY v_amt DESC
LIMIT 10;

opt[imixed]_1:

MATCH
  (cc:Customer)-[:VISITED]->(m:Merchant {mcht_id: "0000000DA5"})
WITH m, collect(DISTINCT cc) as visitors
UNWIND visitors as cc
MATCH (uc:Customer)-[:VISITED]->(mm:Merchant)<-[:VISITED]-(cc)
WHERE
  mm <> m
WITH
  DISTINCT visitors, uc
WHERE NOT uc IN visitors
MATCH
  (uc:Customer)-[rr:VISITED]->()
WITH
  uc, round(100*sum(rr.amount))/100 as v_amt
ORDER BY v_amt DESC
LIMIT 10
RETURN uc.pty_id, v_amt;

opt_2:

MATCH (m:Merchant {mcht_id: "0000000DA5"})
CALL apoc.path.expandConfig(m, {uniqueness:'NODE_GLOBAL', relationshipFilter:'VISITED'
,minLevel:3, maxLevel:3}) YIELD path
WITH last(nodes(path)) as uc
MATCH
  (uc:Customer)-[rr:VISITED]->()
WITH
  uc
  ,round(100*sum(rr.amount))/100 as v_amt
ORDER BY v_amt DESC
LIMIT 10
RETURN uc.pty_id, v_amt;

Great thanks for help with Neo4j optimizing to InverseFalcon in this topic.

But  despite all efforts at the moment Oracle wins this performance test!

See also: Part I: Neo4j vs Oracle Performance Test

#neo4j #oracle #performance

Conclusions

Good results for OpenSource project Neo4j but such monster as Oracle once again demonstrated its superiority.

 

Appendix

Neo4j EXPLAINS and PROFILE:

not optimized

opt_1

opt_2

opt_2 profile

 

Oracle Queries Plans:

sdmrnv, 2019-07-01 [5.422ms, s]