Import a huge among of data from Elasticsearch 2 to Mongodb fails on memory limit

0

I need help with import about 25 millions items from Elasticsearch to Mongodb. I wrote php script to do it but when the script reaches 16 millions items it fails on memory limit and throws me an error: VirtualAlloc() failed: [0x000005af] The paging file is too small for this operation to complete. I changed the system settings - virtual memory (paging file) to 100 000 according this web, but it is still not enough. I dont understand why it allocates so much memory. To get data from Elasticsearch I use scroll api. Look at the script:

<?php

error_reporting( E_ALL );
ini_set( 'memory_limit', -1 );
ini_set( 'max_execution_time', -1 );

/** @var \Nette\DI\Container $container */
$container = require( __DIR__ . '/../app/bootstrap.php' );

echo "----------------------------------------------------------------\n";
echo "--------------------- EVENT INDEX IMPORT -----------------------\n";
echo "----------------------------------------------------------------\n";

echo 'memory_limit: ' . ini_get( 'memory_limit' ) . "\n";

/** @var MongoConnect $mongo */
$mongo = $container->getService( 'mongo' );
/** @var \MongoDB\Collection $eventsCollection */
$eventsCollection = $mongo->selectCollection( 'Events', 'events' );

/** @var Elastica\Client $elastic */
$elastic = new Elastica\Client();
/** @var Elastica\Index $elasticIndex */
$elasticScrollData = $elastic->getIndex( 'event' )->request( '_search?scroll=10s', 'GET', ['size' => 250, 'sort' => ['_doc']] )->getData();
$countAll = $elasticScrollData['hits']['total'];

echo 'ES ALL ITEMS COUNT ' . $countAll . "\n";

$offset = 0;
saveToMongo( $elasticScrollData, $countAll, $offset, $elastic, $eventsCollection );


function saveToMongo( $scrollData, $countAll, $offset, \Elastica\Client $elastic, \MongoDB\Collection $mongoCollection )
{
    $documents = [];
    foreach ( $scrollData['hits']['hits'] as $item )
    {
        $doc = [];
        $doc['ico'] = (array)$item['_source']['ico'];
        $doc['data'] = $item['_source'];
        if( isset( $item['_type'] ) ) $doc['type'] = $item['_type'];
        if( isset( $item['_source']['key'] ) ) $doc['key'] = $item['_source']['key'];
        if( isset( $item['_source']['action'] ) ) $doc['action'] = $item['_source']['action'];
        if( isset( $item['_source']['publishDate'] ) ) $doc['publishDate'] = stringToDate( $item['_source']['publishDate'] );
        if( isset( $item['_source']['generateDate'] ) ) $doc['generateDate'] = stringToDate( $item['_source']['generateDate'] );
        if( isset( $item['_source']['eventDate'] ) ) $doc['eventDate'] = stringToDate( $item['_source']['eventDate'] );

        $documents[] = $doc;

        $offset++;
    }

    try
    {
        $mongoCollection->insertMany( $documents, ['ordered' => FALSE] );
        echo '--- offest ' . ( $offset ) . ' OK' . "\n";
    }
    catch( \Exception $e )
    {
        echo '+++ insert exception: ' . $e->getMessage() . "\n";
    }


    if( $offset < $countAll )
    {
        $scrollData = $elastic->request( '_search/scroll', 'GET', ['scroll' => '10s', 'scroll_id' => $scrollData['_scroll_id']] )->getData();
        saveToMongo( $scrollData, $countAll, $offset, $elastic, $mongoCollection );
    }
}


function stringToDate( $string )
{
    if( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+[\d:]+$/', $string ) ) $format = 'Y-m-d\TH:i:s.uT';
    elseif( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+$/', $string ) ) $format = 'Y-m-d\TH:i:s.u';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+[\d:]+$/', $string ) ) $format = 'Y-m-d\TH:i:sT';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/', $string ) ) $format = 'Y-m-d\TH:i:s';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}\+[\d:]+$/', $string ) ) $format = 'Y-m-dT';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}$/', $string ) ) $format = 'Y-m-d';

    return DateTime::createFromFormat( $format, $string );
}


echo "------------------------------------------------------------------------\n";
echo "------------------------- EVERYTHING IS DONE ---------------------------\n";
echo "------------------------------------------------------------------------\n";
mongodb
elasticsearch
memory
import
limit
asked on Stack Overflow Oct 22, 2018 by Čamo • edited Oct 22, 2018 by Čamo

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0