Я работаю с 10-узловым кластером Infinispan, используемым в качестве бэкэнда Hibernate Search. Наши серверы работают на сервере TC 2.5 (tomcat 6.0.32) на Java 1.6_24. Мы используем jGroups 2.12.1.3 для обработки записей кэша кластера с каждого узла и для многоадресной передачи UDP.
Когда мы запускаем 3+ узла в нашем кластере, в конечном итоге один из узлов начинает регистрировать таймауты репликации. Мы наблюдали один и тот же результат, независимо от того, настраиваем ли мы Infinispan для репликации или для режимов кеша распространения. Хотя остальная часть кластера остается стабильной, отказавший узел становится практически непригодным для поиска.
Наша конфигурация:
Infinispan:
<?xml version="1.0" encoding="UTF-8"?>
<infinispan
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:infinispan:config:5.0 http://www.infinispan.org/schemas/infinispan-config-5.0.xsd"
xmlns="urn:infinispan:config:5.0">
<global>
<globalJmxStatistics
enabled="true"
cacheManagerName="HibernateSearch"
allowDuplicateDomains="true" />
<transport
clusterName="HibernateSearch-Infinispan-cluster-MT"
distributedSyncTimeout="50000">
<properties>
<property name="configurationFile" value="infinispan-udp.cfg.xml" />
</properties>
</transport>
<shutdown
hookBehavior="DONT_REGISTER" />
</global>
<default>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<storeAsBinary storeKeysAsBinary="false" storeValuesAsBinary="true"
enabled="false" />
<invocationBatching
enabled="true" />
<clustering
mode="replication">
<stateRetrieval
timeout="60000"
logFlushTimeout="65000"
fetchInMemoryState="true"
alwaysProvideInMemoryState="true" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<jmxStatistics
enabled="true" />
<eviction
maxEntries="-1"
strategy="NONE" />
<expiration
maxIdle="-1" />
</default>
<namedCache
name="LuceneIndexesMetadata">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/metadata" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesData">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/data" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesLocking">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
</namedCache>
jGroups (UDP):
<config xmlns="urn:org:jgroups"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">
<UDP
mcast_addr="${jgroups.udp.mcast_addr:228.10.10.9}"
mcast_port="${jgroups.udp.mcast_port:45599}"
tos="8"
ucast_recv_buf_size="20000000"
ucast_send_buf_size="640000"
mcast_recv_buf_size="25000000"
mcast_send_buf_size="640000"
loopback="true"
discard_incompatible_packets="true"
max_bundle_size="64000"
max_bundle_timeout="30"
ip_ttl="${jgroups.udp.ip_ttl:2}"
enable_bundling="true"
enable_diagnostics="false"
thread_naming_pattern="pl"
thread_pool.enabled="true"
thread_pool.min_threads="2"
thread_pool.max_threads="30"
thread_pool.keep_alive_time="5000"
thread_pool.queue_enabled="false"
thread_pool.queue_max_size="100"
thread_pool.rejection_policy="Discard"
oob_thread_pool.enabled="true"
oob_thread_pool.min_threads="2"
oob_thread_pool.max_threads="30"
oob_thread_pool.keep_alive_time="5000"
oob_thread_pool.queue_enabled="false"
oob_thread_pool.queue_max_size="100"
oob_thread_pool.rejection_policy="Discard"
/>
И ошибки, которые мы наблюдаем:
10-31-2011 13:53:02 ERROR Hibernate Search: Directory writer-3 interceptors.InvocationContextInterceptor: ISPN000136: Execution error
org.infinispan.util.concurrent.TimeoutException: Replication timeout for tc-cluster-0105-21082
at org.infinispan.remoting.transport.AbstractTransport.parseResponseAndAddToResponseList(AbstractTransport.java:71)
at org.infinispan.remoting.transport.jgroups.JGroupsTransport.invokeRemotely(JGroupsTransport.java:452)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:132)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:156)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:265)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:252)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:235)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:228)
at org.infinispan.interceptors.ReplicationInterceptor.handleCrudMethod(ReplicationInterceptor.java:116)
at org.infinispan.interceptors.ReplicationInterceptor.visitPutKeyValueCommand(ReplicationInterceptor.java:79)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.LockingInterceptor.visitPutKeyValueCommand(LockingInterceptor.java:294)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.base.CommandInterceptor.handleDefault(CommandInterceptor.java:133)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.TxInterceptor.enlistWriteAndInvokeNext(TxInterceptor.java:214)
at org.infinispan.interceptors.TxInterceptor.visitPutKeyValueCommand(TxInterceptor.java:162)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.CacheMgmtInterceptor.visitPutKeyValueCommand(CacheMgmtInterceptor.java:114)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.InvocationContextInterceptor.handleAll(InvocationContextInterceptor.java:104)
at org.infinispan.interceptors.InvocationContextInterceptor.handleDefault(InvocationContextInterceptor.java:64)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.BatchingInterceptor.handleDefault(BatchingInterceptor.java:77)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.InterceptorChain.invoke(InterceptorChain.java:274)
at org.infinispan.CacheImpl.putIfAbsent(CacheImpl.java:524)
at org.infinispan.CacheSupport.putIfAbsent(CacheSupport.java:74)
at org.infinispan.lucene.locking.BaseLuceneLock.obtain(BaseLuceneLock.java:65)
at org.apache.lucene.store.Lock.obtain(Lock.java:72)
at org.apache.lucene.index.IndexWriter.<init>(IndexWriter.java:1097)
at org.hibernate.search.backend.Workspace.createNewIndexWriter(Workspace.java:202)
at org.hibernate.search.backend.Workspace.getIndexWriter(Workspace.java:180)
at org.hibernate.search.backend.impl.lucene.PerDPQueueProcessor.run(PerDPQueueProcessor.java:103)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:662)
Поскольку эта ошибка настолько распространена, независимо от нашей топологии или режима кэширования, мы считаем, что мы должны быть где-то неправильно настроены. Кто-нибудь может порекомендовать исправление?